In [131]:
import pandas as pd
import numpy as np
from sklearn import preprocessing, svm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
import pickle

In [132]:
# Importing a dataset
data =  pd.read_csv('data/for_model.csv', index_col = [0])
y = pd.read_csv("data/for_model_y.csv", index_col = [0])

In [133]:
data.head()

Unnamed: 0,location,location_population,occupation,hobbies,friends_number,sex,credit_card_type,relationship_status,education,dob,daily_commute
0,Chrzanów,37123,Numerical and material recording clerks,"Stretching,Cooking,Parkour",268,1.0,Visa,In relationship,3.0,34.0,32.0
1,Konstancin-Jeziorna,17191,Production and specialised services managers,"Squash,Cardio,Astronomy",204,0.0,Mastercard,In relationship,3.0,36.0,5.0
2,Stalowa Wola,61903,Personal care workers,"Cardio,Power bike,Wood carving,Acting,Rappelling",128,1.0,Visa,In relationship,3.0,23.0,18.0
3,Braniewo,17074,"Armed forces occupations, other ranks","Bodybuilding,Fitness,Skimboarding",156,1.0,Visa,Married with kids,5.0,56.0,76.0
4,Sieniawa,2146,General and keyboard clerks,"Cardio,Rowing,Gym,Dowsing",182,0.0,Visa,In relationship,5.0,37.0,45.0


We can see that the data is less because we are only working on training data

In [134]:
NA = pd.DataFrame(data=[data.notna().sum().tolist(),
            data.isna().sum().tolist(), ["{:.2f}".format(i)+'%' \
           for i in (data.isna().sum()/data.shape[0]*100).tolist()]], 
           columns=data.columns, index=['Not NA Count', 'NA Count', 'NA Percent']).transpose()
NA

Unnamed: 0,Not NA Count,NA Count,NA Percent
location,2800,0,0.00%
location_population,2800,0,0.00%
occupation,2800,0,0.00%
hobbies,2322,478,17.07%
friends_number,2800,0,0.00%
sex,2800,0,0.00%
credit_card_type,2800,0,0.00%
relationship_status,2800,0,0.00%
education,2800,0,0.00%
dob,2800,0,0.00%


Data that does not have missing values

In [135]:
data_no_na = data.dropna()
data_no_na.count()

location               2322
location_population    2322
occupation             2322
hobbies                2322
friends_number         2322
sex                    2322
credit_card_type       2322
relationship_status    2322
education              2322
dob                    2322
daily_commute          2322
dtype: int64

In [136]:
data_is_na = data[data.isna().any(axis=1)]
data_is_na

Unnamed: 0,location,location_population,occupation,hobbies,friends_number,sex,credit_card_type,relationship_status,education,dob,daily_commute
10,Hajnówka,20919,"Armed forces occupations, other ranks",,254,1.0,American Express,Married with kids,5.0,68.0,64.000000
11,Jastrzębie-Zdrój,89590,"Legal, social and cultural professionals",,341,1.0,Visa,Single,5.0,25.0,38.000000
12,Jastrzębie-Zdrój,89590,"Agricultural, forestry and fishery labourers",,229,1.0,Mastercard,Married,2.0,53.0,37.000000
13,Jaworzno,92090,Commissioned armed forces officers,,244,0.0,Visa,In relationship,5.0,27.0,21.000000
34,Bytom,168394,Other clerical support workers,,267,1.0,Visa,Married,2.0,57.0,88.000000
...,...,...,...,...,...,...,...,...,...,...,...
2760,Warszawa,1764615,"Legal, social, cultural and related associate ...",,188,0.0,Visa,Single,2.0,22.0,5.000000
2776,Włocławek,111752,"Food processing, wood working, garment and oth...",,307,1.0,Mastercard,In relationship,3.0,49.0,50.704293
2782,Tarnów,109650,Non-commissioned armed forces officers,,296,1.0,Visa,Divorced,4.0,78.0,46.838235
2787,Kraków,767348,Market-oriented skilled agricultural workers,,213,1.0,Mastercard,Married,1.0,68.0,50.000000


In [137]:
hobby_columns = data_no_na['hobbies'].str.get_dummies(',') # Save hobbies column as one hot encorder
hobby_columns.head()

Unnamed: 0,3D printing,Acting,Air sports,Amateur radio,Archery,Astronomy,BASE jumping,Backpacking,Badminton,Baseball,...,Wood carving,Woodworking,Worldbuilding,Writing,Yo-yoing,Yoga,amateur radio,role-playing games,scrapbook,tabletop games
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


There are 174 different hobbies in the data we have

We examine what hobbies occur most frequently

In [138]:
hobb = hobby_columns.sum().sort_values(ascending=False)
# hobb.head(12)

We only choose the hobbies that are most common (greater than 100)

In [139]:
hobb_100 = hobb[hobb > 100]

In [140]:
# hobb_100.head(20)

In [141]:
part_hobbies = hobby_columns[hobb_100.index]
part_hobbies.head()

Unnamed: 0,Gym,Squash,Bodybuilding,Rowing,Cardio,Weightlifting,Badminton,Crossfit,Fitness,Stretching,Power bike,Kettleballs
0,0,0,0,0,0,0,0,0,0,1,0,0
1,0,1,0,0,1,0,0,0,0,0,0,0
2,0,0,0,0,1,0,0,0,0,0,1,0
3,0,0,1,0,0,0,0,0,1,0,0,0
4,1,0,0,1,1,0,0,0,0,0,0,0


In [142]:
# hot_credit_card_type = data_no_na['credit_card_type'].str.get_dummies() # Application of one hot encoding
# hot_credit_card_type = hot_credit_card_type.drop(hot_credit_card_type.columns[0], axis=1) # reduction in the number of columns
# hot_credit_card_type.head()

In [143]:
# hot_credit_card_type.sum()

In [144]:
# hot_relationship_status = data_no_na['relationship_status'].str.get_dummies() # Application of one hot encoding
# hot_relationship_status = hot_relationship_status.drop(hot_relationship_status.columns[0], axis=1) # reduction in the number of columns
# hot_relationship_status.head()

In [145]:
# hot_relationship_status.sum()

Combining fransformed data into a single table

In [146]:
# df_main = pd.concat([data_no_na, part_hobbies], axis=1)
# df_main = pd.concat([df_main, hot_relationship_status], axis=1)
# df_main = pd.concat([df_main, hot_credit_card_type], axis=1)

Removing unnecessary columns

In [147]:
# df_main = df_main.drop(columns=['location', 'occupation', 'credit_card_type', 'hobbies', 'relationship_status'])

In [148]:
# df_main.head()

In [149]:
# df_m = df_main.drop(columns=part_hobbies.columns)

In [150]:
# df_m.head()

In [151]:
# df_m.count()

In [152]:
X_train, X_test, y_train, y_test = train_test_split(data_no_na, part_hobbies, test_size = 0.25)

Displaying the training set

In [153]:
y_train.head()

Unnamed: 0,Gym,Squash,Bodybuilding,Rowing,Cardio,Weightlifting,Badminton,Crossfit,Fitness,Stretching,Power bike,Kettleballs
2785,0,0,0,0,0,0,0,0,0,0,0,0
1348,0,0,1,0,0,0,0,0,1,0,0,0
1016,0,0,0,0,0,0,0,0,0,0,0,0
1889,0,0,0,0,0,0,0,0,0,0,0,0
486,0,0,0,0,0,0,0,0,0,0,0,0


In [154]:
y_train.columns

Index(['Gym', 'Squash', 'Bodybuilding', 'Rowing', 'Cardio', 'Weightlifting',
       'Badminton', 'Crossfit', 'Fitness', 'Stretching', 'Power bike',
       'Kettleballs'],
      dtype='object')

In [155]:
y_train.sum()

Gym              139
Squash           138
Bodybuilding     134
Rowing           129
Cardio           121
Weightlifting    113
Badminton        108
Crossfit         105
Fitness          102
Stretching       107
Power bike       103
Kettleballs      103
dtype: int64

In [156]:
# hot_credit_card_type_na = data_is_na['credit_card_type'].str.get_dummies()
# hot_credit_card_type_na = hot_credit_card_type_na.drop(hot_credit_card_type_na.columns[0], axis=1)

# hot_relationship_status_na = data_is_na['relationship_status'].str.get_dummies()
# hot_relationship_status_na = hot_relationship_status_na.drop(hot_relationship_status_na.columns[0], axis=1)

# # df_main_na = pd.concat([data_is_na, part_hobbies_na], axis=1)
# df_main_na = pd.concat([data_is_na, hot_relationship_status_na], axis=1)
# df_main_na = pd.concat([df_main_na, hot_credit_card_type_na], axis=1)

# df_main_na = df_main_na.drop(columns=['location', 'occupation', 'credit_card_type', 'hobbies', 'relationship_status'])

In [157]:
# t_X_train = X_train[X_train.isna().any(axis=1)]
# n_X_train = X_train.dropna()

def transform_function(df, n):
    hobby_columns = df['hobbies'].str.get_dummies(',') # Save hobbies column as one hot encorder
    hobb = hobby_columns.sum().sort_values(ascending=False)
    hobb_100 = hobb[hobb > 100]
    part_hobbies = hobby_columns[hobb_100.index]

    hot_credit_card_type = df['credit_card_type'].str.get_dummies() # Application of one hot encoding
    hot_credit_card_type = hot_credit_card_type.drop(hot_credit_card_type.columns[0], axis=1) # reduction in the number of columns

    hot_relationship_status = df['relationship_status'].str.get_dummies() # Application of one hot encoding
    hot_relationship_status = hot_relationship_status.drop(hot_relationship_status.columns[0], axis=1) # reduction in the number of columns

    df = pd.concat([df, part_hobbies], axis=1)
    df = pd.concat([df, hot_relationship_status], axis=1)
    df = pd.concat([df, hot_credit_card_type], axis=1)

    df = df.drop(columns=['location', 'occupation', 'credit_card_type', 'hobbies', 'relationship_status'])

    if n == True:
        df = df.drop(columns=part_hobbies.columns)

    return df

In [158]:
X_train

Unnamed: 0,location,location_population,occupation,hobbies,friends_number,sex,credit_card_type,relationship_status,education,dob,daily_commute
2785,Kielce,196804,Personal care workers,Shooting,81,1.0,Mastercard,In relationship,3.0,24.000000,19.000000
1348,Ostróda,33243,Administrative and commercial managers,"Fitness,Bodybuilding,Yoga,Sand art",234,1.0,Mastercard,Single,2.0,47.000000,43.000000
1016,Łódź,690422,"Armed forces occupations, other ranks",Ice skating,116,0.0,Mastercard,Married,5.0,55.000000,98.000000
1889,Grudziądz,95629,Non-commissioned armed forces officers,"Coffee roasting,Paintball,Glassblowing",297,1.0,Mastercard,Married with kids,4.0,46.000000,24.000000
486,Kraków,767348,"Armed forces occupations, other ranks","Water sports,Poi,Jogging",158,0.0,Mastercard,In relationship,4.0,37.000000,37.000000
...,...,...,...,...,...,...,...,...,...,...,...
357,Knurów,38594,"Hospitality, retail and other services managers","Rowing,Slacklining,Jigsaw puzzles",152,1.0,Mastercard,In relationship,5.0,45.672000,58.000000
1605,Jaworzno,92090,Assemblers,"Gym,Candle making,Painting",164,1.0,American Express,Single,2.0,28.000000,61.000000
334,Warszawa,1764615,"Building and related trades workers, excluding...",Sculpting,178,1.0,Mastercard,Married,4.0,61.000000,93.000000
2286,Ożarów,4594,Science and engineering professionals,"Squash,Flying disc",143,0.0,Mastercard,In relationship,4.0,43.698598,49.673516


In [159]:
X_train_trans = transform_function(X_train, True)

In [160]:
X_train_trans

Unnamed: 0,location_population,friends_number,sex,education,dob,daily_commute,In relationship,Married,Married with kids,Single,Mastercard,Revolut,Visa
2785,196804,81,1.0,3.0,24.000000,19.000000,1,0,0,0,1,0,0
1348,33243,234,1.0,2.0,47.000000,43.000000,0,0,0,1,1,0,0
1016,690422,116,0.0,5.0,55.000000,98.000000,0,1,0,0,1,0,0
1889,95629,297,1.0,4.0,46.000000,24.000000,0,0,1,0,1,0,0
486,767348,158,0.0,4.0,37.000000,37.000000,1,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
357,38594,152,1.0,5.0,45.672000,58.000000,1,0,0,0,1,0,0
1605,92090,164,1.0,2.0,28.000000,61.000000,0,0,0,1,0,0,0
334,1764615,178,1.0,4.0,61.000000,93.000000,0,1,0,0,1,0,0
2286,4594,143,0.0,4.0,43.698598,49.673516,1,0,0,0,1,0,0


In [161]:
# df_main_na.head()

Training the model on the training group and then applying the model to the data for the group with barracking data. -> creating a df with the results  
Each column has its own Logistic regression model

In [162]:
# n_X_train.count()

In [163]:
X_train_trans

Unnamed: 0,location_population,friends_number,sex,education,dob,daily_commute,In relationship,Married,Married with kids,Single,Mastercard,Revolut,Visa
2785,196804,81,1.0,3.0,24.000000,19.000000,1,0,0,0,1,0,0
1348,33243,234,1.0,2.0,47.000000,43.000000,0,0,0,1,1,0,0
1016,690422,116,0.0,5.0,55.000000,98.000000,0,1,0,0,1,0,0
1889,95629,297,1.0,4.0,46.000000,24.000000,0,0,1,0,1,0,0
486,767348,158,0.0,4.0,37.000000,37.000000,1,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
357,38594,152,1.0,5.0,45.672000,58.000000,1,0,0,0,1,0,0
1605,92090,164,1.0,2.0,28.000000,61.000000,0,0,0,1,0,0,0
334,1764615,178,1.0,4.0,61.000000,93.000000,0,1,0,0,1,0,0
2286,4594,143,0.0,4.0,43.698598,49.673516,1,0,0,0,1,0,0


In [164]:
y_train[hobby].values

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

Tworzenie modeli

In [165]:
trained_models = {} # Dict with models
# new_df_m = pd.DataFrame() # df with predictions

for hobby in y_train.columns:
    model = LogisticRegression()
    model.fit(X_train_trans, np.array(y_train[hobby].values))
    trained_models[hobby] = model # save trained models to dict

Używanie modeli

In [166]:
pred_hobby = pd.DataFrame()

for hobby, model in trained_models.items():
    pred = model.predict(X_train_trans)
    pred_hobby[hobby] = pred


Przewidywania tragiczne 

In [174]:
pred_hobby.sum()

Gym              0
Squash           0
Bodybuilding     0
Rowing           0
Cardio           0
Weightlifting    0
Badminton        0
Crossfit         0
Fitness          0
Stretching       0
Power bike       0
Kettleballs      0
dtype: int64

In [175]:
y_train.sum()

Gym              139
Squash           138
Bodybuilding     134
Rowing           129
Cardio           121
Weightlifting    113
Badminton        108
Crossfit         105
Fitness          102
Stretching       107
Power bike       103
Kettleballs      103
dtype: int64

In [168]:
# trained_models = {} # Dict with models
# new_df_m = pd.DataFrame() # df with predictions

# for hobby in y_train.columns:
#     model = LogisticRegression()
#     model.fit(X_train_trans, np.array(y_train[hobby].values))
#     trained_models[hobby] = model # save trained models to dict
#     pred = model.predict(X_train_trans)
#     new_df_m[hobby] = pred

In [169]:
trained_models

{'Gym': LogisticRegression(),
 'Squash': LogisticRegression(),
 'Bodybuilding': LogisticRegression(),
 'Rowing': LogisticRegression(),
 'Cardio': LogisticRegression(),
 'Weightlifting': LogisticRegression(),
 'Badminton': LogisticRegression(),
 'Crossfit': LogisticRegression(),
 'Fitness': LogisticRegression(),
 'Stretching': LogisticRegression(),
 'Power bike': LogisticRegression(),
 'Kettleballs': LogisticRegression()}

In [170]:
# pred_hobby = pd.DataFrame()

# for hobby, model in trained_models.items():
#     pred = model.predict(X_train_trans)
#     pred_hobby[hobby] = pred

Saving models to a file in piccle format

In [172]:
t_X_train

Unnamed: 0,location_population,friends_number,sex,education,dob,daily_commute,In relationship,Married,Married with kids,Single,Mastercard,Revolut,Visa


Czemu wynik jest tak wysoki??

In [176]:
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_train, pred_hobby)
print("Prediction precision:", accuracy)


Prediction precision: 0.44227455485353245


In [None]:
with open('models/all_trained_hobby_models.pkl', 'wb') as f:
    pickle.dump(trained_models, f)


Sprawdzanie ilości wiersz w uzyskanym df

In [None]:
new_df_m.count()

Niestety w uzyskanym df wszystkie wiersze mają wynik 0

In [None]:
new_df_m.sum()

In [None]:
new_df_m.head()

In [None]:
df_main_na.head()

In [None]:
df_pred = df_main_na.copy()
df_pred.reset_index(inplace=True)
df_pred[new_df_m.columns] = new_df_m
df_pred.head()

In [None]:
df_pred_all = pd.concat([df_pred, df_main], axis=0)
df_pred_all.drop(columns=['index'], inplace=True)
df_pred_all