In [35]:
import pandas as pd
import numpy as np
from sklearn import preprocessing, svm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
import pickle

In [36]:
# Importing a dataset
data =  pd.read_csv('data/for_model.csv', index_col = [0])
y = pd.read_csv("data/for_model_y.csv", index_col = [0])

In [37]:
data.head()

Unnamed: 0,location,location_population,occupation,hobbies,friends_number,sex,credit_card_type,relationship_status,education,dob,daily_commute
0,Chrzanów,37123,Numerical and material recording clerks,"Stretching,Cooking,Parkour",268,1.0,Visa,In relationship,3.0,34.0,32.0
1,Konstancin-Jeziorna,17191,Production and specialised services managers,"Squash,Cardio,Astronomy",204,0.0,Mastercard,In relationship,3.0,36.0,5.0
2,Stalowa Wola,61903,Personal care workers,"Cardio,Power bike,Wood carving,Acting,Rappelling",128,1.0,Visa,In relationship,3.0,23.0,18.0
3,Braniewo,17074,"Armed forces occupations, other ranks","Bodybuilding,Fitness,Skimboarding",156,1.0,Visa,Married with kids,5.0,56.0,76.0
4,Sieniawa,2146,General and keyboard clerks,"Cardio,Rowing,Gym,Dowsing",182,0.0,Visa,In relationship,5.0,37.0,45.0


We can see that the data is less because we are only working on training data

In [38]:
NA = pd.DataFrame(data=[data.notna().sum().tolist(),
            data.isna().sum().tolist(), ["{:.2f}".format(i)+'%' \
           for i in (data.isna().sum()/data.shape[0]*100).tolist()]], 
           columns=data.columns, index=['Not NA Count', 'NA Count', 'NA Percent']).transpose()
NA

Unnamed: 0,Not NA Count,NA Count,NA Percent
location,2800,0,0.00%
location_population,2800,0,0.00%
occupation,2800,0,0.00%
hobbies,2322,478,17.07%
friends_number,2800,0,0.00%
sex,2800,0,0.00%
credit_card_type,2800,0,0.00%
relationship_status,2800,0,0.00%
education,2800,0,0.00%
dob,2800,0,0.00%


Data that does not have missing values

In [39]:
data_no_na = data.dropna()
data_no_na.count()

location               2322
location_population    2322
occupation             2322
hobbies                2322
friends_number         2322
sex                    2322
credit_card_type       2322
relationship_status    2322
education              2322
dob                    2322
daily_commute          2322
dtype: int64

In [40]:
data_is_na = data[data.isna().any(axis=1)]
data_is_na

Unnamed: 0,location,location_population,occupation,hobbies,friends_number,sex,credit_card_type,relationship_status,education,dob,daily_commute
10,Hajnówka,20919,"Armed forces occupations, other ranks",,254,1.0,American Express,Married with kids,5.0,68.0,64.000000
11,Jastrzębie-Zdrój,89590,"Legal, social and cultural professionals",,341,1.0,Visa,Single,5.0,25.0,38.000000
12,Jastrzębie-Zdrój,89590,"Agricultural, forestry and fishery labourers",,229,1.0,Mastercard,Married,2.0,53.0,37.000000
13,Jaworzno,92090,Commissioned armed forces officers,,244,0.0,Visa,In relationship,5.0,27.0,21.000000
34,Bytom,168394,Other clerical support workers,,267,1.0,Visa,Married,2.0,57.0,88.000000
...,...,...,...,...,...,...,...,...,...,...,...
2760,Warszawa,1764615,"Legal, social, cultural and related associate ...",,188,0.0,Visa,Single,2.0,22.0,5.000000
2776,Włocławek,111752,"Food processing, wood working, garment and oth...",,307,1.0,Mastercard,In relationship,3.0,49.0,50.704293
2782,Tarnów,109650,Non-commissioned armed forces officers,,296,1.0,Visa,Divorced,4.0,78.0,46.838235
2787,Kraków,767348,Market-oriented skilled agricultural workers,,213,1.0,Mastercard,Married,1.0,68.0,50.000000


In [41]:
hobby_columns = data_no_na['hobbies'].str.get_dummies(',') # Save hobbies column as one hot encorder
hobby_columns.head()

Unnamed: 0,3D printing,Acting,Air sports,Amateur radio,Archery,Astronomy,BASE jumping,Backpacking,Badminton,Baseball,...,Wood carving,Woodworking,Worldbuilding,Writing,Yo-yoing,Yoga,amateur radio,role-playing games,scrapbook,tabletop games
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


There are 174 different hobbies in the data we have

We examine what hobbies occur most frequently

In [42]:
hobb = hobby_columns.sum().sort_values(ascending=False)
# hobb.head(12)

We only choose the hobbies that are most common (greater than 100)

In [43]:
hobb_100 = hobb[hobb > 100]

In [44]:
# hobb_100.head(20)

In [45]:
part_hobbies = hobby_columns[hobb_100.index]
part_hobbies.head()

Unnamed: 0,Gym,Squash,Bodybuilding,Rowing,Cardio,Weightlifting,Badminton,Crossfit,Fitness,Stretching,Power bike,Kettleballs
0,0,0,0,0,0,0,0,0,0,1,0,0
1,0,1,0,0,1,0,0,0,0,0,0,0
2,0,0,0,0,1,0,0,0,0,0,1,0
3,0,0,1,0,0,0,0,0,1,0,0,0
4,1,0,0,1,1,0,0,0,0,0,0,0


In [46]:
# hot_credit_card_type = data_no_na['credit_card_type'].str.get_dummies() # Application of one hot encoding
# hot_credit_card_type = hot_credit_card_type.drop(hot_credit_card_type.columns[0], axis=1) # reduction in the number of columns
# hot_credit_card_type.head()

In [47]:
# hot_credit_card_type.sum()

In [48]:
# hot_relationship_status = data_no_na['relationship_status'].str.get_dummies() # Application of one hot encoding
# hot_relationship_status = hot_relationship_status.drop(hot_relationship_status.columns[0], axis=1) # reduction in the number of columns
# hot_relationship_status.head()

In [49]:
# hot_relationship_status.sum()

Combining fransformed data into a single table

In [50]:
# df_main = pd.concat([data_no_na, part_hobbies], axis=1)
# df_main = pd.concat([df_main, hot_relationship_status], axis=1)
# df_main = pd.concat([df_main, hot_credit_card_type], axis=1)

Removing unnecessary columns

In [51]:
# df_main = df_main.drop(columns=['location', 'occupation', 'credit_card_type', 'hobbies', 'relationship_status'])

In [52]:
# df_main.head()

In [53]:
# df_m = df_main.drop(columns=part_hobbies.columns)

In [54]:
# df_m.head()

In [55]:
# df_m.count()

In [56]:
X_train, X_test, y_train, y_test = train_test_split(data_no_na, part_hobbies, test_size = 0.25)

Displaying the training set

In [57]:
y_train.head()

Unnamed: 0,Gym,Squash,Bodybuilding,Rowing,Cardio,Weightlifting,Badminton,Crossfit,Fitness,Stretching,Power bike,Kettleballs
752,0,0,0,0,0,0,0,0,0,0,0,0
2520,0,0,0,0,0,1,0,0,0,0,0,1
1298,0,0,0,0,0,0,0,0,0,0,0,0
1904,0,0,0,0,0,0,0,0,0,0,0,0
845,0,0,1,0,0,0,0,0,0,0,0,0


In [58]:
y_train.columns

Index(['Gym', 'Squash', 'Bodybuilding', 'Rowing', 'Cardio', 'Weightlifting',
       'Badminton', 'Crossfit', 'Fitness', 'Stretching', 'Power bike',
       'Kettleballs'],
      dtype='object')

In [59]:
y_train.sum()

Gym              135
Squash           138
Bodybuilding     129
Rowing           119
Cardio           114
Weightlifting    124
Badminton        117
Crossfit         109
Fitness          113
Stretching        96
Power bike       104
Kettleballs      104
dtype: int64

In [60]:
# hot_credit_card_type_na = data_is_na['credit_card_type'].str.get_dummies()
# hot_credit_card_type_na = hot_credit_card_type_na.drop(hot_credit_card_type_na.columns[0], axis=1)

# hot_relationship_status_na = data_is_na['relationship_status'].str.get_dummies()
# hot_relationship_status_na = hot_relationship_status_na.drop(hot_relationship_status_na.columns[0], axis=1)

# # df_main_na = pd.concat([data_is_na, part_hobbies_na], axis=1)
# df_main_na = pd.concat([data_is_na, hot_relationship_status_na], axis=1)
# df_main_na = pd.concat([df_main_na, hot_credit_card_type_na], axis=1)

# df_main_na = df_main_na.drop(columns=['location', 'occupation', 'credit_card_type', 'hobbies', 'relationship_status'])

In [61]:
# t_X_train = X_train[X_train.isna().any(axis=1)]
# n_X_train = X_train.dropna()

def transform_function(df, n):
    hobby_columns = df['hobbies'].str.get_dummies(',') # Save hobbies column as one hot encorder
    hobb = hobby_columns.sum().sort_values(ascending=False) # Sorting from largest to smallest
    hobb_100 = hobb[hobb > 100] # Choosing hobbies that repeat 100 or more times
    part_hobbies = hobby_columns[hobb_100.index] # saving the selected hobbies

    hot_credit_card_type = df['credit_card_type'].str.get_dummies() # Application of one hot encoding
    hot_credit_card_type = hot_credit_card_type.drop(hot_credit_card_type.columns[0], axis=1) # reduction in the number of columns

    hot_relationship_status = df['relationship_status'].str.get_dummies() # Application of one hot encoding
    hot_relationship_status = hot_relationship_status.drop(hot_relationship_status.columns[0], axis=1) # reduction in the number of columns

    if n == True:
        pass
    else:
        df = pd.concat([df, part_hobbies], axis=1)

    # df = pd.concat([df, part_hobbies], axis=1)
    df = pd.concat([df, hot_relationship_status], axis=1)
    df = pd.concat([df, hot_credit_card_type], axis=1)

    df = df.drop(columns=['location', 'occupation', 'credit_card_type', 'hobbies', 'relationship_status'])

    # if n == True:
    #     pass
    # else:
    #     df = pd.concat([df, part_hobbies], axis=1)


    return df

In [62]:
X_train

Unnamed: 0,location,location_population,occupation,hobbies,friends_number,sex,credit_card_type,relationship_status,education,dob,daily_commute
752,Kraków,767348,"Food processing, wood working, garment and oth...","Metal detecting,Mushroom hunting",307,1.0,Visa,In relationship,5.0,53.000000,73.000000
2520,Radomsko,46409,Electrical and electronic trades workers,"Kettleballs,Weightlifting",176,1.0,Mastercard,In relationship,4.0,36.000000,16.000000
1298,Ostróda,33243,Commissioned armed forces officers,Astronomy,52,0.0,Mastercard,In relationship,2.0,43.698598,41.000000
1904,Łódź,690422,Cleaners and helpers,"Lockpicking,Snowboarding,Basketball",375,1.0,Visa,In relationship,4.0,46.000000,50.704293
845,Jarocin,26245,Non-commissioned armed forces officers,Bodybuilding,223,1.0,Visa,Married,2.0,47.000000,89.000000
...,...,...,...,...,...,...,...,...,...,...,...
964,Radków,2427,"Hospitality, retail and other services managers","Crossfit,Power bike,Rock climbing,Nordic skating",0,0.0,Mastercard,Married,4.0,50.000000,22.000000
1755,Białystok,297288,"Legal, social and cultural professionals",Stretching,174,1.0,Visa,In relationship,5.0,54.000000,38.000000
856,Koszalin,107670,Information and communications technology prof...,"Weightlifting,Squash,Vacation",175,1.0,Mastercard,Married,2.0,36.000000,72.000000
2361,Rzeszów,189662,Health associate professionals,"Power bike,Motor sports",82,0.0,Visa,In relationship,1.0,38.000000,65.000000


In [63]:
X_train_trans = transform_function(X_train, True)

In [64]:
X_train_trans

Unnamed: 0,location_population,friends_number,sex,education,dob,daily_commute,In relationship,Married,Married with kids,Single,Mastercard,Revolut,Visa
752,767348,307,1.0,5.0,53.000000,73.000000,1,0,0,0,0,0,1
2520,46409,176,1.0,4.0,36.000000,16.000000,1,0,0,0,1,0,0
1298,33243,52,0.0,2.0,43.698598,41.000000,1,0,0,0,1,0,0
1904,690422,375,1.0,4.0,46.000000,50.704293,1,0,0,0,0,0,1
845,26245,223,1.0,2.0,47.000000,89.000000,0,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
964,2427,0,0.0,4.0,50.000000,22.000000,0,1,0,0,1,0,0
1755,297288,174,1.0,5.0,54.000000,38.000000,1,0,0,0,0,0,1
856,107670,175,1.0,2.0,36.000000,72.000000,0,1,0,0,1,0,0
2361,189662,82,0.0,1.0,38.000000,65.000000,1,0,0,0,0,0,1


In [65]:
# df_main_na.head()

Training the model on the training group and then applying the model to the data for the group with barracking data. -> creating a df with the results  
Each column has its own Logistic regression model

In [66]:
# n_X_train.count()

In [67]:
X_train_trans

Unnamed: 0,location_population,friends_number,sex,education,dob,daily_commute,In relationship,Married,Married with kids,Single,Mastercard,Revolut,Visa
752,767348,307,1.0,5.0,53.000000,73.000000,1,0,0,0,0,0,1
2520,46409,176,1.0,4.0,36.000000,16.000000,1,0,0,0,1,0,0
1298,33243,52,0.0,2.0,43.698598,41.000000,1,0,0,0,1,0,0
1904,690422,375,1.0,4.0,46.000000,50.704293,1,0,0,0,0,0,1
845,26245,223,1.0,2.0,47.000000,89.000000,0,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
964,2427,0,0.0,4.0,50.000000,22.000000,0,1,0,0,1,0,0
1755,297288,174,1.0,5.0,54.000000,38.000000,1,0,0,0,0,0,1
856,107670,175,1.0,2.0,36.000000,72.000000,0,1,0,0,1,0,0
2361,189662,82,0.0,1.0,38.000000,65.000000,1,0,0,0,0,0,1


In [68]:
# y_train[hobby].values

Tworzenie modeli

In [69]:
trained_models = {} # Dict with models
# new_df_m = pd.DataFrame() # df with predictions

for hobby in y_train.columns:
    model = LogisticRegression()
    model.fit(X_train_trans, np.array(y_train[hobby].values))
    trained_models[hobby] = model # save trained models to dict

Używanie modeli

In [70]:
pred_hobby = pd.DataFrame()

for hobby, model in trained_models.items():
    pred = model.predict(X_train_trans)
    pred_hobby[hobby] = pred


Przewidywania tragiczne 

In [71]:
pred_hobby.sum()

Gym              0
Squash           0
Bodybuilding     0
Rowing           0
Cardio           0
Weightlifting    0
Badminton        0
Crossfit         0
Fitness          0
Stretching       0
Power bike       0
Kettleballs      0
dtype: int64

In [72]:
y_train.sum()

Gym              135
Squash           138
Bodybuilding     129
Rowing           119
Cardio           114
Weightlifting    124
Badminton        117
Crossfit         109
Fitness          113
Stretching        96
Power bike       104
Kettleballs      104
dtype: int64

In [73]:
# trained_models = {} # Dict with models
# new_df_m = pd.DataFrame() # df with predictions

# for hobby in y_train.columns:
#     model = LogisticRegression()
#     model.fit(X_train_trans, np.array(y_train[hobby].values))
#     trained_models[hobby] = model # save trained models to dict
#     pred = model.predict(X_train_trans)
#     new_df_m[hobby] = pred

In [74]:
trained_models

{'Gym': LogisticRegression(),
 'Squash': LogisticRegression(),
 'Bodybuilding': LogisticRegression(),
 'Rowing': LogisticRegression(),
 'Cardio': LogisticRegression(),
 'Weightlifting': LogisticRegression(),
 'Badminton': LogisticRegression(),
 'Crossfit': LogisticRegression(),
 'Fitness': LogisticRegression(),
 'Stretching': LogisticRegression(),
 'Power bike': LogisticRegression(),
 'Kettleballs': LogisticRegression()}

In [75]:
# pred_hobby = pd.DataFrame()

# for hobby, model in trained_models.items():
#     pred = model.predict(X_train_trans)
#     pred_hobby[hobby] = pred

Saving models to a file in piccle format

Czemu wynik jest tak wysoki??

## Checking the results for the training set

In [76]:
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_train, pred_hobby)
print("Prediction precision:", accuracy)


Prediction precision: 0.4382538770821367


## Checking the results for the test set

In [80]:
X_test_trans = transform_function(X_test, True)

In [81]:
pred_hobby = pd.DataFrame()

for hobby, model in trained_models.items():
    pred = model.predict(X_test_trans)
    pred_hobby[hobby] = pred

In [82]:
accuracy = accuracy_score(y_test, pred_hobby)
print("Prediction precision:", accuracy)

Prediction precision: 0.43373493975903615


In [77]:
with open('models/all_trained_hobby_models.pkl', 'wb') as f:
    pickle.dump(trained_models, f)


Sprawdzanie ilości wiersz w uzyskanym df

In [78]:
new_df_m.count()

NameError: name 'new_df_m' is not defined

Niestety w uzyskanym df wszystkie wiersze mają wynik 0

In [None]:
new_df_m.sum()

In [None]:
new_df_m.head()

In [None]:
df_main_na.head()

In [None]:
df_pred = df_main_na.copy()
df_pred.reset_index(inplace=True)
df_pred[new_df_m.columns] = new_df_m
df_pred.head()

In [None]:
df_pred_all = pd.concat([df_pred, df_main], axis=0)
df_pred_all.drop(columns=['index'], inplace=True)
df_pred_all