In [128]:
import pandas as pd
import numpy as np
from sklearn import preprocessing, svm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
import pickle

In [88]:
# Importing a dataset
data =  pd.read_csv('data/for_model.csv', index_col = [0])
y = pd.read_csv("data/for_model_y.csv", index_col = [0])

In [89]:
data.head()

Unnamed: 0,location,location_population,occupation,hobbies,friends_number,sex,credit_card_type,relationship_status,education,dob,daily_commute
0,Chrzanów,37123,Numerical and material recording clerks,"Stretching,Cooking,Parkour",268,1.0,Visa,In relationship,3.0,34.0,32.0
1,Konstancin-Jeziorna,17191,Production and specialised services managers,"Squash,Cardio,Astronomy",204,0.0,Mastercard,In relationship,3.0,36.0,5.0
2,Stalowa Wola,61903,Personal care workers,"Cardio,Power bike,Wood carving,Acting,Rappelling",128,1.0,Visa,In relationship,3.0,23.0,18.0
3,Braniewo,17074,"Armed forces occupations, other ranks","Bodybuilding,Fitness,Skimboarding",156,1.0,Visa,Married with kids,5.0,56.0,76.0
4,Sieniawa,2146,General and keyboard clerks,"Cardio,Rowing,Gym,Dowsing",182,0.0,Visa,In relationship,5.0,37.0,45.0


We can see that the data is less because we are only working on training data

In [90]:
NA = pd.DataFrame(data=[data.notna().sum().tolist(),
            data.isna().sum().tolist(), ["{:.2f}".format(i)+'%' \
           for i in (data.isna().sum()/data.shape[0]*100).tolist()]], 
           columns=data.columns, index=['Not NA Count', 'NA Count', 'NA Percent']).transpose()
NA

Unnamed: 0,Not NA Count,NA Count,NA Percent
location,2800,0,0.00%
location_population,2800,0,0.00%
occupation,2800,0,0.00%
hobbies,2322,478,17.07%
friends_number,2800,0,0.00%
sex,2800,0,0.00%
credit_card_type,2800,0,0.00%
relationship_status,2800,0,0.00%
education,2800,0,0.00%
dob,2800,0,0.00%


Data that does not have missing values

In [91]:
data_no_na = data.dropna()
data_no_na.count()

location               2322
location_population    2322
occupation             2322
hobbies                2322
friends_number         2322
sex                    2322
credit_card_type       2322
relationship_status    2322
education              2322
dob                    2322
daily_commute          2322
dtype: int64

In [92]:
data_is_na = data[data.isna().any(axis=1)]
data_is_na

Unnamed: 0,location,location_population,occupation,hobbies,friends_number,sex,credit_card_type,relationship_status,education,dob,daily_commute
10,Hajnówka,20919,"Armed forces occupations, other ranks",,254,1.0,American Express,Married with kids,5.0,68.0,64.000000
11,Jastrzębie-Zdrój,89590,"Legal, social and cultural professionals",,341,1.0,Visa,Single,5.0,25.0,38.000000
12,Jastrzębie-Zdrój,89590,"Agricultural, forestry and fishery labourers",,229,1.0,Mastercard,Married,2.0,53.0,37.000000
13,Jaworzno,92090,Commissioned armed forces officers,,244,0.0,Visa,In relationship,5.0,27.0,21.000000
34,Bytom,168394,Other clerical support workers,,267,1.0,Visa,Married,2.0,57.0,88.000000
...,...,...,...,...,...,...,...,...,...,...,...
2760,Warszawa,1764615,"Legal, social, cultural and related associate ...",,188,0.0,Visa,Single,2.0,22.0,5.000000
2776,Włocławek,111752,"Food processing, wood working, garment and oth...",,307,1.0,Mastercard,In relationship,3.0,49.0,50.704293
2782,Tarnów,109650,Non-commissioned armed forces officers,,296,1.0,Visa,Divorced,4.0,78.0,46.838235
2787,Kraków,767348,Market-oriented skilled agricultural workers,,213,1.0,Mastercard,Married,1.0,68.0,50.000000


In [93]:
hobby_columns = data_no_na['hobbies'].str.get_dummies(',') # Save hobbies column as one hot encorder
hobby_columns.head()

Unnamed: 0,3D printing,Acting,Air sports,Amateur radio,Archery,Astronomy,BASE jumping,Backpacking,Badminton,Baseball,...,Wood carving,Woodworking,Worldbuilding,Writing,Yo-yoing,Yoga,amateur radio,role-playing games,scrapbook,tabletop games
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


There are 174 different hobbies in the data we have

We examine what hobbies occur most frequently

In [94]:
hobb = hobby_columns.sum().sort_values(ascending=False)
hobb.head(20)

Gym                 183
Squash              183
Bodybuilding        178
Rowing              171
Cardio              161
Weightlifting       158
Badminton           149
Crossfit            147
Fitness             145
Stretching          143
Power bike          143
Kettleballs         135
Parkour              40
Poi                  39
Sculpting            39
Skydiving            36
Drama                36
Book restoration     36
Taxidermy            35
Geocaching           35
dtype: int64

We only choose the hobbies that are most common (greater than 100)

In [95]:
hobb_100 = hobb[hobb > 100]

In [96]:
hobb_100.head(20)

Gym              183
Squash           183
Bodybuilding     178
Rowing           171
Cardio           161
Weightlifting    158
Badminton        149
Crossfit         147
Fitness          145
Stretching       143
Power bike       143
Kettleballs      135
dtype: int64

In [97]:
part_hobbies = hobby_columns[hobb_100.index]
part_hobbies.head()

Unnamed: 0,Gym,Squash,Bodybuilding,Rowing,Cardio,Weightlifting,Badminton,Crossfit,Fitness,Stretching,Power bike,Kettleballs
0,0,0,0,0,0,0,0,0,0,1,0,0
1,0,1,0,0,1,0,0,0,0,0,0,0
2,0,0,0,0,1,0,0,0,0,0,1,0
3,0,0,1,0,0,0,0,0,1,0,0,0
4,1,0,0,1,1,0,0,0,0,0,0,0


In [98]:
hot_credit_card_type = data_no_na['credit_card_type'].str.get_dummies() # Application of one hot encoding
hot_credit_card_type = hot_credit_card_type.drop(hot_credit_card_type.columns[0], axis=1) # reduction in the number of columns
hot_credit_card_type.head()

Unnamed: 0,Mastercard,Revolut,Visa
0,0,0,1
1,1,0,0
2,0,0,1
3,0,0,1
4,0,0,1


In [99]:
hot_credit_card_type.sum()

Mastercard     929
Revolut         92
Visa          1282
dtype: int64

In [100]:
hot_relationship_status = data_no_na['relationship_status'].str.get_dummies() # Application of one hot encoding
hot_relationship_status = hot_relationship_status.drop(hot_relationship_status.columns[0], axis=1) # reduction in the number of columns
hot_relationship_status

Unnamed: 0,In relationship,Married,Married with kids,Single
0,1,0,0,0
1,1,0,0,0
2,1,0,0,0
3,0,0,1,0
4,1,0,0,0
...,...,...,...,...
2794,1,0,0,0
2796,1,0,0,0
2797,0,0,0,1
2798,0,0,1,0


In [101]:
hot_relationship_status.sum()

In relationship      970
Married              476
Married with kids    396
Single               349
dtype: int64

Combining fransformed data into a single table

In [102]:
df_main = pd.concat([data_no_na, part_hobbies], axis=1)
df_main = pd.concat([df_main, hot_relationship_status], axis=1)
df_main = pd.concat([df_main, hot_credit_card_type], axis=1)

Removing unnecessary columns

In [103]:
df_main = df_main.drop(columns=['location', 'occupation', 'credit_card_type', 'hobbies', 'relationship_status'])

In [104]:
df_main

Unnamed: 0,location_population,friends_number,sex,education,dob,daily_commute,Gym,Squash,Bodybuilding,Rowing,...,Stretching,Power bike,Kettleballs,In relationship,Married,Married with kids,Single,Mastercard,Revolut,Visa
0,37123,268,1.0,3.0,34.000000,32.0,0,0,0,0,...,1,0,0,1,0,0,0,0,0,1
1,17191,204,0.0,3.0,36.000000,5.0,0,1,0,0,...,0,0,0,1,0,0,0,1,0,0
2,61903,128,1.0,3.0,23.000000,18.0,0,0,0,0,...,0,1,0,1,0,0,0,0,0,1
3,17074,156,1.0,5.0,56.000000,76.0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,1
4,2146,182,0.0,5.0,37.000000,45.0,1,0,0,1,...,0,0,0,1,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2794,403883,254,0.0,3.0,51.000000,81.0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0
2796,111752,178,0.0,5.0,70.000000,90.0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
2797,109021,226,0.0,3.0,36.000000,62.0,0,0,0,0,...,0,0,0,0,0,0,1,1,0,0
2798,638586,216,0.0,3.0,45.265734,91.0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1


In [105]:
df_m = df_main.drop(columns=part_hobbies.columns)

In [106]:
df_m

Unnamed: 0,location_population,friends_number,sex,education,dob,daily_commute,In relationship,Married,Married with kids,Single,Mastercard,Revolut,Visa
0,37123,268,1.0,3.0,34.000000,32.0,1,0,0,0,0,0,1
1,17191,204,0.0,3.0,36.000000,5.0,1,0,0,0,1,0,0
2,61903,128,1.0,3.0,23.000000,18.0,1,0,0,0,0,0,1
3,17074,156,1.0,5.0,56.000000,76.0,0,0,1,0,0,0,1
4,2146,182,0.0,5.0,37.000000,45.0,1,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2794,403883,254,0.0,3.0,51.000000,81.0,1,0,0,0,1,0,0
2796,111752,178,0.0,5.0,70.000000,90.0,1,0,0,0,0,0,1
2797,109021,226,0.0,3.0,36.000000,62.0,0,0,0,1,1,0,0
2798,638586,216,0.0,3.0,45.265734,91.0,0,0,1,0,0,0,1


In [107]:
df_m.count()

location_population    2322
friends_number         2322
sex                    2322
education              2322
dob                    2322
daily_commute          2322
In relationship        2322
Married                2322
Married with kids      2322
Single                 2322
Mastercard             2322
Revolut                2322
Visa                   2322
dtype: int64

In [108]:
X_train, X_test, y_train, y_test = train_test_split(df_m, part_hobbies, test_size = 0.25)

Displaying the training set

In [109]:
y_train.head()

Unnamed: 0,Gym,Squash,Bodybuilding,Rowing,Cardio,Weightlifting,Badminton,Crossfit,Fitness,Stretching,Power bike,Kettleballs
1290,0,0,0,0,0,0,0,0,0,0,0,0
1958,0,0,0,0,0,0,0,0,0,0,0,0
2644,0,0,0,0,0,0,0,0,0,0,0,0
1450,0,0,0,0,0,1,0,0,0,0,0,0
443,0,0,0,1,0,0,0,1,0,0,0,0


In [110]:
y_train.columns

Index(['Gym', 'Squash', 'Bodybuilding', 'Rowing', 'Cardio', 'Weightlifting',
       'Badminton', 'Crossfit', 'Fitness', 'Stretching', 'Power bike',
       'Kettleballs'],
      dtype='object')

In [111]:
y_train.sum()

Gym              136
Squash           128
Bodybuilding     126
Rowing           124
Cardio           115
Weightlifting    121
Badminton        115
Crossfit         108
Fitness          110
Stretching       116
Power bike       102
Kettleballs      102
dtype: int64

In [112]:
data_is_na['hobbies']

10      NaN
11      NaN
12      NaN
13      NaN
34      NaN
       ... 
2760    NaN
2776    NaN
2782    NaN
2787    NaN
2795    NaN
Name: hobbies, Length: 478, dtype: object

In [113]:
hot_credit_card_type_na = data_is_na['credit_card_type'].str.get_dummies()
hot_credit_card_type_na = hot_credit_card_type_na.drop(hot_credit_card_type_na.columns[0], axis=1)

hot_relationship_status_na = data_is_na['relationship_status'].str.get_dummies()
hot_relationship_status_na = hot_relationship_status_na.drop(hot_relationship_status_na.columns[0], axis=1)

# df_main_na = pd.concat([data_is_na, part_hobbies_na], axis=1)
df_main_na = pd.concat([data_is_na, hot_relationship_status_na], axis=1)
df_main_na = pd.concat([df_main_na, hot_credit_card_type_na], axis=1)

df_main_na = df_main_na.drop(columns=['location', 'occupation', 'credit_card_type', 'hobbies', 'relationship_status'])



In [114]:
df_main_na

Unnamed: 0,location_population,friends_number,sex,education,dob,daily_commute,In relationship,Married,Married with kids,Single,Mastercard,Revolut,Visa
10,20919,254,1.0,5.0,68.0,64.000000,0,0,1,0,0,0,0
11,89590,341,1.0,5.0,25.0,38.000000,0,0,0,1,0,0,1
12,89590,229,1.0,2.0,53.0,37.000000,0,1,0,0,1,0,0
13,92090,244,0.0,5.0,27.0,21.000000,1,0,0,0,0,0,1
34,168394,267,1.0,2.0,57.0,88.000000,0,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2760,1764615,188,0.0,2.0,22.0,5.000000,0,0,0,1,0,0,1
2776,111752,307,1.0,3.0,49.0,50.704293,1,0,0,0,1,0,0
2782,109650,296,1.0,4.0,78.0,46.838235,0,0,0,0,0,0,1
2787,767348,213,1.0,1.0,68.0,50.000000,0,1,0,0,1,0,0


In [115]:
new_df = pd.DataFrame()
for hobby in y_train.columns:
    model = LogisticRegression()
    model.fit(X_train, np.array(y_train[hobby].values))
    pred = model.predict(X_train)
    new_df[hobby] = pred
    # print(pred)

Trenowanie modelu na grupie treningowej a następnie aplikacja modelu dla danych dla grupy z barakującymi danymi. -> tworzenie df z wynikami  
Każda kolumna ma swój własny model regresji Logistycznej

In [126]:
trained_models = {}

new_df_m = pd.DataFrame()
for hobby in y_train.columns:
    model = LogisticRegression()
    model.fit(X_train, np.array(y_train[hobby].values))
    trained_models[hobby] = model # save trained models to dict
    pred = model.predict(df_main_na)
    new_df_m[hobby] = pred

In [127]:
trained_models

{'Gym': LogisticRegression(),
 'Squash': LogisticRegression(),
 'Bodybuilding': LogisticRegression(),
 'Rowing': LogisticRegression(),
 'Cardio': LogisticRegression(),
 'Weightlifting': LogisticRegression(),
 'Badminton': LogisticRegression(),
 'Crossfit': LogisticRegression(),
 'Fitness': LogisticRegression(),
 'Stretching': LogisticRegression(),
 'Power bike': LogisticRegression(),
 'Kettleballs': LogisticRegression()}

Saving models to a file in piccle format

In [129]:
with open('models/all_trained_hobby_models.pkl', 'wb') as f:
    pickle.dump(trained_models, f)


Sprawdzanie ilości wiersz w uzyskanym df

In [117]:
new_df_m.count()

Gym              478
Squash           478
Bodybuilding     478
Rowing           478
Cardio           478
Weightlifting    478
Badminton        478
Crossfit         478
Fitness          478
Stretching       478
Power bike       478
Kettleballs      478
dtype: int64

Niestety w uzyskanym df wszystkie wiersze mają wynik 0

In [118]:
new_df_m.sum()

Gym              1
Squash           0
Bodybuilding     0
Rowing           0
Cardio           0
Weightlifting    0
Badminton        0
Crossfit         0
Fitness          0
Stretching       0
Power bike       0
Kettleballs      0
dtype: int64

In [119]:
new_df_m

Unnamed: 0,Gym,Squash,Bodybuilding,Rowing,Cardio,Weightlifting,Badminton,Crossfit,Fitness,Stretching,Power bike,Kettleballs
0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
473,0,0,0,0,0,0,0,0,0,0,0,0
474,0,0,0,0,0,0,0,0,0,0,0,0
475,0,0,0,0,0,0,0,0,0,0,0,0
476,0,0,0,0,0,0,0,0,0,0,0,0


In [120]:
df_main_na

Unnamed: 0,location_population,friends_number,sex,education,dob,daily_commute,In relationship,Married,Married with kids,Single,Mastercard,Revolut,Visa
10,20919,254,1.0,5.0,68.0,64.000000,0,0,1,0,0,0,0
11,89590,341,1.0,5.0,25.0,38.000000,0,0,0,1,0,0,1
12,89590,229,1.0,2.0,53.0,37.000000,0,1,0,0,1,0,0
13,92090,244,0.0,5.0,27.0,21.000000,1,0,0,0,0,0,1
34,168394,267,1.0,2.0,57.0,88.000000,0,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2760,1764615,188,0.0,2.0,22.0,5.000000,0,0,0,1,0,0,1
2776,111752,307,1.0,3.0,49.0,50.704293,1,0,0,0,1,0,0
2782,109650,296,1.0,4.0,78.0,46.838235,0,0,0,0,0,0,1
2787,767348,213,1.0,1.0,68.0,50.000000,0,1,0,0,1,0,0


In [121]:
df_pred = df_main_na.copy()
df_pred.reset_index(inplace=True)
df_pred[new_df_m.columns] = new_df_m
df_pred

Unnamed: 0,index,location_population,friends_number,sex,education,dob,daily_commute,In relationship,Married,Married with kids,...,Bodybuilding,Rowing,Cardio,Weightlifting,Badminton,Crossfit,Fitness,Stretching,Power bike,Kettleballs
0,10,20919,254,1.0,5.0,68.0,64.000000,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,11,89590,341,1.0,5.0,25.0,38.000000,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,12,89590,229,1.0,2.0,53.0,37.000000,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,13,92090,244,0.0,5.0,27.0,21.000000,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,34,168394,267,1.0,2.0,57.0,88.000000,0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
473,2760,1764615,188,0.0,2.0,22.0,5.000000,0,0,0,...,0,0,0,0,0,0,0,0,0,0
474,2776,111752,307,1.0,3.0,49.0,50.704293,1,0,0,...,0,0,0,0,0,0,0,0,0,0
475,2782,109650,296,1.0,4.0,78.0,46.838235,0,0,0,...,0,0,0,0,0,0,0,0,0,0
476,2787,767348,213,1.0,1.0,68.0,50.000000,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [122]:
df_pred_all = pd.concat([df_pred, df_main], axis=0)
df_pred_all.drop(columns=['index'], inplace=True)
df_pred_all

Unnamed: 0,location_population,friends_number,sex,education,dob,daily_commute,In relationship,Married,Married with kids,Single,...,Bodybuilding,Rowing,Cardio,Weightlifting,Badminton,Crossfit,Fitness,Stretching,Power bike,Kettleballs
0,20919,254,1.0,5.0,68.000000,64.0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,89590,341,1.0,5.0,25.000000,38.0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,89590,229,1.0,2.0,53.000000,37.0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,92090,244,0.0,5.0,27.000000,21.0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,168394,267,1.0,2.0,57.000000,88.0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2794,403883,254,0.0,3.0,51.000000,81.0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2796,111752,178,0.0,5.0,70.000000,90.0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2797,109021,226,0.0,3.0,36.000000,62.0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2798,638586,216,0.0,3.0,45.265734,91.0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [123]:
df_pred_all['target'] = y

In [124]:
df_pred_all.count()

location_population    2800
friends_number         2800
sex                    2800
education              2800
dob                    2800
daily_commute          2800
In relationship        2800
Married                2800
Married with kids      2800
Single                 2800
Mastercard             2800
Revolut                2800
Visa                   2800
Gym                    2800
Squash                 2800
Bodybuilding           2800
Rowing                 2800
Cardio                 2800
Weightlifting          2800
Badminton              2800
Crossfit               2800
Fitness                2800
Stretching             2800
Power bike             2800
Kettleballs            2800
target                 1970
dtype: int64

Saving the completed data in csv format

In [125]:
df_pred_all.to_csv('data/fitted_data.csv')