In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [2]:
# Read the dataset into `df`. The path is relative, so the CSV must sit in
# the notebook's working directory.
csv_path = "synthetic_loss_data.csv"
df = pd.read_csv(csv_path)

In [3]:
# Preview the first rows (head() defaults to five) to check the columns loaded as expected.
df.head()

Unnamed: 0,Cultural_Background,Religious_Affiliation,Level_of_Education,Family_Income_Level,Gender,Urban_or_Rural,Peer_Group_Influence,Education,Parenting_Style,Tech_Media_Exposure,Age_of_Loss
0,African,Highly Religious,College,Low,Non-binary,Rural,High,No,Authoritarian,High,21.0
1,Middle Eastern,,High School,High,Male,Rural,High,Yes,Permissive,Medium,21.0
2,Western,,High School,Low,Non-binary,Urban,High,Yes,Authoritarian,High,14.0
3,African,Highly Religious,College,Middle,Female,Urban,Moderate,Yes,Authoritarian,Medium,20.0
4,African,,College,Middle,Non-binary,Rural,Low,No,Permissive,Low,20.0


In [4]:
# Build one lookup per feature column: every distinct value gets an integer
# code, numbered from 1 in the order `unique()` returns them (first
# appearance). The target column 'Age_of_Loss' is left out.
# NOTE(review): missing values (NaN) receive a code of their own, and the
# codes impose an arbitrary numeric order on the categories — confirm this is
# what the downstream linear model should see.
column_maps = {
    feature: {
        label: code
        for code, label in enumerate(df[feature].unique(), start=1)
    }
    for feature in df.columns
    if feature != 'Age_of_Loss'
}
    

In [5]:
# Display the value -> integer code lookup held for each feature column.
column_maps

{'Cultural_Background': {'African': 1,
  'Middle Eastern': 2,
  'Western': 3,
  'Asian': 4},
 'Religious_Affiliation': {'Highly Religious': 1, nan: 2, 'Moderate': 3},
 'Level_of_Education': {'College': 1, 'High School': 2, 'Advanced Degree': 3},
 'Family_Income_Level': {'Low': 1, 'High': 2, 'Middle': 3},
 'Gender': {'Non-binary': 1, 'Male': 2, 'Female': 3},
 'Urban_or_Rural': {'Rural': 1, 'Urban': 2},
 'Peer_Group_Influence': {'High': 1, 'Moderate': 2, 'Low': 3},
 'Education': {'No': 1, 'Yes': 2},
 'Parenting_Style': {'Authoritarian': 1, 'Permissive': 2, 'Neglectful': 3},
 'Tech_Media_Exposure': {'High': 1, 'Medium': 2, 'Low': 3}}

In [6]:
# Replace each categorical feature with its integer code from `column_maps`.
# The result is written back into `df`, so the loop is guarded to keep the
# cell idempotent: mapping an already-encoded column again would look up
# integer codes in a dict keyed by the raw labels, and `Series.map` would
# turn every value into NaN.
for column in df:
    if column == 'Age_of_Loss':
        continue  # regression target: already numeric, has no lookup
    mapping = column_maps[column]
    if df[column].isin(list(mapping.values())).all():
        # Every value is already one of this column's codes (cell re-run).
        continue
    df[column] = df[column].map(mapping)

In [7]:
# Display the frame after the feature columns were replaced by integer codes
# (pandas truncates the middle rows of a long frame in the rendered output).
df

Unnamed: 0,Cultural_Background,Religious_Affiliation,Level_of_Education,Family_Income_Level,Gender,Urban_or_Rural,Peer_Group_Influence,Education,Parenting_Style,Tech_Media_Exposure,Age_of_Loss
0,1,1,1,1,1,1,1,1,1,1,21.0
1,2,2,2,2,2,1,1,2,2,2,21.0
2,3,2,2,1,1,2,1,2,1,1,14.0
3,1,1,1,3,3,2,2,2,1,2,20.0
4,1,2,1,3,1,1,3,1,2,3,20.0
...,...,...,...,...,...,...,...,...,...,...,...
495,1,2,3,2,2,2,1,2,3,2,17.0
496,1,2,2,1,3,1,2,1,3,1,20.0
497,2,2,3,3,2,2,2,2,1,2,15.0
498,3,3,2,1,1,2,3,2,2,1,20.0


In [8]:
# Hold out 20% of the rows for evaluation. `random_state` pins the shuffle so
# the split, the fitted coefficients and the reported error are the same on
# every Restart & Run All; without it each run evaluates on different rows.
RANDOM_STATE = 42
X_train, X_test, Y_train, Y_test = train_test_split(
    # Label-based slice (inclusive at both ends): all feature columns.
    df.loc[:, 'Cultural_Background':'Tech_Media_Exposure'],
    # Regression target.
    df['Age_of_Loss'],
    test_size=0.2,
    random_state=RANDOM_STATE,
)

In [9]:
# Inspect the training feature matrix produced by the split.
X_train

Unnamed: 0,Cultural_Background,Religious_Affiliation,Level_of_Education,Family_Income_Level,Gender,Urban_or_Rural,Peer_Group_Influence,Education,Parenting_Style,Tech_Media_Exposure
494,3,1,1,2,3,1,2,2,1,2
385,2,2,3,1,3,1,1,1,2,2
381,4,3,1,1,1,2,3,1,2,1
174,2,1,1,1,2,1,1,1,2,1
42,4,2,1,3,2,2,3,1,2,3
...,...,...,...,...,...,...,...,...,...,...
280,2,1,3,3,2,1,2,1,1,1
40,1,2,2,2,3,2,2,1,1,2
139,3,3,3,2,3,2,2,2,1,2
134,1,3,2,2,2,2,3,2,3,1


In [10]:
# Inspect the training targets (the Age_of_Loss column) produced by the split.
Y_train

494    19.0
385    18.0
381    19.0
174    18.0
42     19.0
       ... 
280    15.0
40     22.0
139    15.0
134    21.0
302    20.0
Name: Age_of_Loss, Length: 400, dtype: float64

In [11]:
# Fit a linear regression of the training targets on the encoded training
# features. `fit` stays the last expression so the cell still displays the
# fitted estimator.
lr = LinearRegression()
lr.fit(X=X_train, y=Y_train)

In [None]:
# Predict the target for every held-out row with the fitted model.
prediction = lr.predict(X=X_test)

In [19]:
# Show the predicted values for the held-out rows.
prediction

array([17.72885117, 17.50979532, 17.97396641, 18.37347662, 18.15415375,
       18.18167041, 18.39277232, 18.46935073, 17.81279144, 17.86974934,
       18.22207214, 18.39190713, 18.28939685, 18.5350651 , 18.00514326,
       18.23744401, 18.175677  , 18.19617791, 17.95034454, 18.80964467,
       18.69285698, 18.30633701, 17.85106674, 17.94334934, 17.79854607,
       18.71528347, 18.02025588, 18.13162171, 18.08907222, 17.98692378,
       17.4801541 , 18.25494842, 17.9211439 , 17.61295023, 18.67624672,
       17.82104083, 17.77378228, 18.53920024, 17.60678513, 18.01488891,
       18.35140343, 18.27558158, 17.81554404, 17.57524033, 17.60548337,
       18.2825149 , 18.75205475, 17.76764836, 17.97550332, 17.22340686,
       18.3938063 , 17.69265661, 17.88136822, 18.20698237, 17.98722562,
       17.70585493, 18.49020089, 18.45534315, 17.7737884 , 17.83170857,
       18.13719742, 17.55836464, 18.46762572, 18.34103753, 18.40215526,
       17.78099021, 17.72345216, 18.49354009, 17.58355839, 17.47

In [20]:
# Sanity check: there should be one prediction per held-out row.
prediction.shape

(100,)

In [21]:
# Display the true target values for the held-out rows, for comparison with the predictions.
Y_test

105    20.0
406    14.0
121    17.0
295    18.0
458    20.0
       ... 
34     19.0
341    19.0
226    16.0
150    17.0
162    21.0
Name: Age_of_Loss, Length: 100, dtype: float64

In [None]:
# Error metrics on the held-out rows. numpy (`np`) comes from the import cell
# at the top of the notebook rather than being imported mid-analysis here.
# Mean absolute error: average of |prediction - truth|.
absolute_deviation = np.abs(prediction - Y_test)
mean_absolute_error = np.mean(absolute_deviation)

# Mean absolute percentage error: each deviation relative to the true value,
# scaled to percent.
# NOTE(review): a true value of 0 would make this division blow up; assumed
# not to occur for this target — confirm.
percentage_deviation = (absolute_deviation / Y_test) * 100
mean_absolute_percentage_error = np.mean(percentage_deviation)

In [24]:
# Show the mean absolute percentage error; the value is already in percent (scaled by 100).
mean_absolute_percentage_error

np.float64(11.393984333856075)