In [153]:
import pandas as pd
import numpy as np
from sklearn import preprocessing, svm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [132]:
# Load the feature table and the target table from their separate CSV files;
# in both files the first column is used as the row index.
# NOTE(review): the previews below show different index labels for `data`
# (0, 1, 2, ...) and `y` (2478, 3531, ...), so the two frames are presumably
# paired by row position rather than by label - confirm both files share the
# same row order.
data = pd.read_csv("data/for_model.csv", index_col=[0])
y = pd.read_csv("data/for_model_y.csv", index_col=[0])

In [133]:
data.head()

Unnamed: 0,location,location_population,occupation,hobbies,friends_number,sex,credit_card_type,relationship_status,education,dob,daily_commute
0,Chrzanów,37123,Numerical and material recording clerks,"Stretching,Cooking,Parkour",268,1.0,Visa,In relationship,3.0,34.0,32.0
1,Konstancin-Jeziorna,17191,Production and specialised services managers,"Squash,Cardio,Astronomy",204,0.0,Mastercard,In relationship,3.0,36.0,5.0
2,Stalowa Wola,61903,Personal care workers,"Cardio,Power bike,Wood carving,Acting,Rappelling",128,1.0,Visa,In relationship,3.0,23.0,18.0
3,Braniewo,17074,"Armed forces occupations, other ranks","Bodybuilding,Fitness,Skimboarding",156,1.0,Visa,Married with kids,5.0,56.0,76.0
4,Sieniawa,2146,General and keyboard clerks,"Cardio,Rowing,Gym,Dowsing",182,0.0,Visa,In relationship,5.0,37.0,45.0


In [134]:
y.head()

Unnamed: 0,target
2478,0
3531,1
2561,1
1321,0
2405,1


There are fewer rows here than in the full dataset because we are working only with the training data.

In [135]:
# Per-column missing-value summary: number of non-null values, number of null
# values, and the share of nulls formatted as a percentage with two decimals.
# Building the frame from named Series keeps the two count columns as integer
# dtype, so the summary can be sorted or filtered numerically; only the
# percentage column holds strings.
na_count = data.isna().sum()
NA = pd.DataFrame({
    'Not NA Count': data.notna().sum(),
    'NA Count': na_count,
    'NA Percent': (na_count / data.shape[0] * 100).map("{:.2f}%".format),
})
NA

Unnamed: 0,Not NA Count,NA Count,NA Percent
location,2800,0,0.00%
location_population,2800,0,0.00%
occupation,2800,0,0.00%
hobbies,2322,478,17.07%
friends_number,2800,0,0.00%
sex,2800,0,0.00%
credit_card_type,2800,0,0.00%
relationship_status,2800,0,0.00%
education,2800,0,0.00%
dob,2800,0,0.00%


In [136]:
# One-hot encode the comma-separated 'hobbies' strings: one 0/1 column per
# distinct hobby.
hobby_columns = data['hobbies'].str.get_dummies(sep=',')

# Append the new indicator columns to the original dataframe.
df_encoded = pd.concat([data, hobby_columns], axis='columns')

# Preview the encoded dataframe.
df_encoded.head()

Unnamed: 0,location,location_population,occupation,hobbies,friends_number,sex,credit_card_type,relationship_status,education,dob,...,Wood carving,Woodworking,Worldbuilding,Writing,Yo-yoing,Yoga,amateur radio,role-playing games,scrapbook,tabletop games
0,Chrzanów,37123,Numerical and material recording clerks,"Stretching,Cooking,Parkour",268,1.0,Visa,In relationship,3.0,34.0,...,0,0,0,0,0,0,0,0,0,0
1,Konstancin-Jeziorna,17191,Production and specialised services managers,"Squash,Cardio,Astronomy",204,0.0,Mastercard,In relationship,3.0,36.0,...,0,0,0,0,0,0,0,0,0,0
2,Stalowa Wola,61903,Personal care workers,"Cardio,Power bike,Wood carving,Acting,Rappelling",128,1.0,Visa,In relationship,3.0,23.0,...,1,0,0,0,0,0,0,0,0,0
3,Braniewo,17074,"Armed forces occupations, other ranks","Bodybuilding,Fitness,Skimboarding",156,1.0,Visa,Married with kids,5.0,56.0,...,0,0,0,0,0,0,0,0,0,0
4,Sieniawa,2146,General and keyboard clerks,"Cardio,Rowing,Gym,Dowsing",182,0.0,Visa,In relationship,5.0,37.0,...,0,0,0,0,0,0,0,0,0,0


There are 185 distinct hobbies in the data.

In [137]:
# Count how many rows list each hobby (column-wise sum of the 0/1 indicators)
# and order the hobbies from most to least frequent.
work = hobby_columns.sum(axis=0).sort_values(ascending=False)

In [138]:
# Sanity check: the column-wise sum of the indicator frame is a pandas Series.
type(work)

pandas.core.series.Series

In [139]:
work.head(20)

Gym                 183
Squash              183
Bodybuilding        178
Rowing              171
Cardio              161
Weightlifting       158
Badminton           149
Crossfit            147
Fitness             145
Stretching          143
Power bike          143
Kettleballs         135
Parkour              40
Poi                  39
Sculpting            39
Skydiving            36
Drama                36
Book restoration     36
Taxidermy            35
Geocaching           35
dtype: int64

We keep only the most common hobbies (those that occur more than 100 times).

In [140]:
# Keep only the hobbies whose occurrence count exceeds 100.
result = work.loc[work.gt(100)]

In [141]:
result.head(20)

Gym              183
Squash           183
Bodybuilding     178
Rowing           171
Cardio           161
Weightlifting    158
Badminton        149
Crossfit         147
Fitness          145
Stretching       143
Power bike       143
Kettleballs      135
dtype: int64

In [142]:
# Restrict the hobby indicator columns to the frequent hobbies kept in `result`.
part_hobbies = hobby_columns.loc[:, result.index]

In [143]:
# Show the selected hobby indicators (the rendered output elides the middle rows).
part_hobbies

Unnamed: 0,Gym,Squash,Bodybuilding,Rowing,Cardio,Weightlifting,Badminton,Crossfit,Fitness,Stretching,Power bike,Kettleballs
0,0,0,0,0,0,0,0,0,0,1,0,0
1,0,1,0,0,1,0,0,0,0,0,0,0
2,0,0,0,0,1,0,0,0,0,0,1,0
3,0,0,1,0,0,0,0,0,1,0,0,0
4,1,0,0,1,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
2795,0,0,0,0,0,0,0,0,0,0,0,0
2796,0,0,0,0,0,0,0,0,0,0,0,0
2797,0,0,0,0,0,0,0,0,0,0,0,0
2798,0,0,0,0,0,0,0,0,0,0,0,0


In [144]:
# One-hot encode the card type into one 0/1 column per distinct value.
# '|' is the default separator of Series.str.get_dummies, spelled out here.
hot_credit_card_type = data['credit_card_type'].str.get_dummies(sep='|')
hot_credit_card_type

Unnamed: 0,American Express,Mastercard,Revolut,Visa
0,0,0,0,1
1,0,1,0,0
2,0,0,0,1
3,0,0,0,1
4,0,0,0,1
...,...,...,...,...
2795,0,1,0,0
2796,0,0,0,1
2797,0,1,0,0
2798,0,0,0,1


In [145]:
hot_credit_card_type.sum()

American Express      23
Mastercard          1136
Revolut              108
Visa                1533
dtype: int64

In [146]:
# One-hot encode the relationship status into one 0/1 column per distinct value.
# '|' is the default separator of Series.str.get_dummies, spelled out here.
hot_relationship_status = data['relationship_status'].str.get_dummies(sep='|')
hot_relationship_status

Unnamed: 0,Divorced,In relationship,Married,Married with kids,Single
0,0,1,0,0,0
1,0,1,0,0,0
2,0,1,0,0,0
3,0,0,0,1,0
4,0,1,0,0,0
...,...,...,...,...,...
2795,0,0,0,0,1
2796,0,1,0,0,0
2797,0,0,0,0,1
2798,0,0,0,1,0


In [147]:
hot_relationship_status.sum()

Divorced              145
In relationship      1172
Married               573
Married with kids     495
Single                415
dtype: int64

In [148]:
# Assemble the modelling frame with a single column-wise concat: the original
# columns followed by the frequent-hobby, relationship-status and card-type
# indicator columns (same column order as concatenating them one at a time).
df_main = pd.concat(
    [data, part_hobbies, hot_relationship_status, hot_credit_card_type],
    axis=1,
)

In [149]:
# Remove the raw text columns so that only numeric features remain.
# Note: this rebinds `df_main`, so running the cell a second time without first
# re-running the concat cell raises KeyError (the columns are already gone).
df_main = df_main.drop(
    ['location', 'occupation', 'credit_card_type', 'hobbies', 'relationship_status'],
    axis='columns',
)

Why do NaN values appear in the first row?

In [150]:
# Inspect the final feature frame that is passed to train_test_split below.
df_main

Unnamed: 0,location_population,friends_number,sex,education,dob,daily_commute,Gym,Squash,Bodybuilding,Rowing,...,Kettleballs,Divorced,In relationship,Married,Married with kids,Single,American Express,Mastercard,Revolut,Visa
0,37123,268,1.0,3.0,34.000000,32.0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
1,17191,204,0.0,3.0,36.000000,5.0,0,1,0,0,...,0,0,1,0,0,0,0,1,0,0
2,61903,128,1.0,3.0,23.000000,18.0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
3,17074,156,1.0,5.0,56.000000,76.0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,1
4,2146,182,0.0,5.0,37.000000,45.0,1,0,0,1,...,0,0,1,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2795,1764615,203,1.0,3.0,55.000000,63.0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
2796,111752,178,0.0,5.0,70.000000,90.0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
2797,109021,226,0.0,3.0,36.000000,62.0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
2798,638586,216,0.0,3.0,45.265734,91.0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1


In [156]:
# Hold out 25% of the rows as a test set. The fixed random_state makes the
# shuffle, and therefore the score computed from this split, reproducible
# across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    df_main, y, test_size=0.25, random_state=42
)

In [157]:
# Fit an ordinary least-squares linear regression on the training split and
# print its score (coefficient of determination, R^2) on the held-out split.
# NOTE(review): the `y.head()` preview shows only 0/1 target values, so this
# may be a classification task better served by a classifier - confirm.
regr = LinearRegression().fit(X_train, y_train)
print(regr.score(X_test, y_test))

0.27051701954923457
