In [180]:
import pandas as pd
import numpy as np
from sklearn import preprocessing, svm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
import pickle

In [181]:
# Importing a dataset
data =  pd.read_csv('data/for_model.csv', index_col = [0])
y = pd.read_csv("data/for_model_y.csv", index_col = [0])

In [182]:
data.head()

Unnamed: 0,location,location_population,occupation,hobbies,friends_number,sex,credit_card_type,relationship_status,education,dob,daily_commute
0,Chrzanów,37123,Numerical and material recording clerks,"Stretching,Cooking,Parkour",268,1.0,Visa,In relationship,3.0,34.0,32.0
1,Konstancin-Jeziorna,17191,Production and specialised services managers,"Squash,Cardio,Astronomy",204,0.0,Mastercard,In relationship,3.0,36.0,5.0
2,Stalowa Wola,61903,Personal care workers,"Cardio,Power bike,Wood carving,Acting,Rappelling",128,1.0,Visa,In relationship,3.0,23.0,18.0
3,Braniewo,17074,"Armed forces occupations, other ranks","Bodybuilding,Fitness,Skimboarding",156,1.0,Visa,Married with kids,5.0,56.0,76.0
4,Sieniawa,2146,General and keyboard clerks,"Cardio,Rowing,Gym,Dowsing",182,0.0,Visa,In relationship,5.0,37.0,45.0


We can see that the data is less because we are only working on training data

In [183]:
NA = pd.DataFrame(data=[data.notna().sum().tolist(),
            data.isna().sum().tolist(), ["{:.2f}".format(i)+'%' \
           for i in (data.isna().sum()/data.shape[0]*100).tolist()]], 
           columns=data.columns, index=['Not NA Count', 'NA Count', 'NA Percent']).transpose()
NA

Unnamed: 0,Not NA Count,NA Count,NA Percent
location,2800,0,0.00%
location_population,2800,0,0.00%
occupation,2800,0,0.00%
hobbies,2322,478,17.07%
friends_number,2800,0,0.00%
sex,2800,0,0.00%
credit_card_type,2800,0,0.00%
relationship_status,2800,0,0.00%
education,2800,0,0.00%
dob,2800,0,0.00%


Data that does not have missing values

In [184]:
data_no_na = data.dropna()
data_no_na.count()

location               2322
location_population    2322
occupation             2322
hobbies                2322
friends_number         2322
sex                    2322
credit_card_type       2322
relationship_status    2322
education              2322
dob                    2322
daily_commute          2322
dtype: int64

In [185]:
data_is_na = data[data.isna().any(axis=1)]
data_is_na

Unnamed: 0,location,location_population,occupation,hobbies,friends_number,sex,credit_card_type,relationship_status,education,dob,daily_commute
10,Hajnówka,20919,"Armed forces occupations, other ranks",,254,1.0,American Express,Married with kids,5.0,68.0,64.000000
11,Jastrzębie-Zdrój,89590,"Legal, social and cultural professionals",,341,1.0,Visa,Single,5.0,25.0,38.000000
12,Jastrzębie-Zdrój,89590,"Agricultural, forestry and fishery labourers",,229,1.0,Mastercard,Married,2.0,53.0,37.000000
13,Jaworzno,92090,Commissioned armed forces officers,,244,0.0,Visa,In relationship,5.0,27.0,21.000000
34,Bytom,168394,Other clerical support workers,,267,1.0,Visa,Married,2.0,57.0,88.000000
...,...,...,...,...,...,...,...,...,...,...,...
2760,Warszawa,1764615,"Legal, social, cultural and related associate ...",,188,0.0,Visa,Single,2.0,22.0,5.000000
2776,Włocławek,111752,"Food processing, wood working, garment and oth...",,307,1.0,Mastercard,In relationship,3.0,49.0,50.704293
2782,Tarnów,109650,Non-commissioned armed forces officers,,296,1.0,Visa,Divorced,4.0,78.0,46.838235
2787,Kraków,767348,Market-oriented skilled agricultural workers,,213,1.0,Mastercard,Married,1.0,68.0,50.000000


In [186]:
hobby_columns = data_no_na['hobbies'].str.get_dummies(',') # Save hobbies column as one hot encorder
hobby_columns.head()

Unnamed: 0,3D printing,Acting,Air sports,Amateur radio,Archery,Astronomy,BASE jumping,Backpacking,Badminton,Baseball,...,Wood carving,Woodworking,Worldbuilding,Writing,Yo-yoing,Yoga,amateur radio,role-playing games,scrapbook,tabletop games
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


There are 174 different hobbies in the data we have

We examine what hobbies occur most frequently

In [187]:
hobb = hobby_columns.sum().sort_values(ascending=False)
hobb.head(12)

Gym              183
Squash           183
Bodybuilding     178
Rowing           171
Cardio           161
Weightlifting    158
Badminton        149
Crossfit         147
Fitness          145
Stretching       143
Power bike       143
Kettleballs      135
dtype: int64

We only choose the hobbies that are most common (greater than 100)

In [188]:
hobb_100 = hobb[hobb > 100]

In [189]:
hobb_100.head(20)

Gym              183
Squash           183
Bodybuilding     178
Rowing           171
Cardio           161
Weightlifting    158
Badminton        149
Crossfit         147
Fitness          145
Stretching       143
Power bike       143
Kettleballs      135
dtype: int64

In [190]:
part_hobbies = hobby_columns[hobb_100.index]
part_hobbies.head()

Unnamed: 0,Gym,Squash,Bodybuilding,Rowing,Cardio,Weightlifting,Badminton,Crossfit,Fitness,Stretching,Power bike,Kettleballs
0,0,0,0,0,0,0,0,0,0,1,0,0
1,0,1,0,0,1,0,0,0,0,0,0,0
2,0,0,0,0,1,0,0,0,0,0,1,0
3,0,0,1,0,0,0,0,0,1,0,0,0
4,1,0,0,1,1,0,0,0,0,0,0,0


In [191]:
hot_credit_card_type = data_no_na['credit_card_type'].str.get_dummies() # Application of one hot encoding
hot_credit_card_type = hot_credit_card_type.drop(hot_credit_card_type.columns[0], axis=1) # reduction in the number of columns
hot_credit_card_type.head()

Unnamed: 0,Mastercard,Revolut,Visa
0,0,0,1
1,1,0,0
2,0,0,1
3,0,0,1
4,0,0,1


In [192]:
hot_credit_card_type.sum()

Mastercard     929
Revolut         92
Visa          1282
dtype: int64

In [193]:
hot_relationship_status = data_no_na['relationship_status'].str.get_dummies() # Application of one hot encoding
hot_relationship_status = hot_relationship_status.drop(hot_relationship_status.columns[0], axis=1) # reduction in the number of columns
hot_relationship_status.head()

Unnamed: 0,In relationship,Married,Married with kids,Single
0,1,0,0,0
1,1,0,0,0
2,1,0,0,0
3,0,0,1,0
4,1,0,0,0


In [194]:
hot_relationship_status.sum()

In relationship      970
Married              476
Married with kids    396
Single               349
dtype: int64

Combining fransformed data into a single table

In [195]:
df_main = pd.concat([data_no_na, part_hobbies], axis=1)
df_main = pd.concat([df_main, hot_relationship_status], axis=1)
df_main = pd.concat([df_main, hot_credit_card_type], axis=1)

Removing unnecessary columns

In [196]:
df_main = df_main.drop(columns=['location', 'occupation', 'credit_card_type', 'hobbies', 'relationship_status'])

In [197]:
df_main.head()

Unnamed: 0,location_population,friends_number,sex,education,dob,daily_commute,Gym,Squash,Bodybuilding,Rowing,...,Stretching,Power bike,Kettleballs,In relationship,Married,Married with kids,Single,Mastercard,Revolut,Visa
0,37123,268,1.0,3.0,34.0,32.0,0,0,0,0,...,1,0,0,1,0,0,0,0,0,1
1,17191,204,0.0,3.0,36.0,5.0,0,1,0,0,...,0,0,0,1,0,0,0,1,0,0
2,61903,128,1.0,3.0,23.0,18.0,0,0,0,0,...,0,1,0,1,0,0,0,0,0,1
3,17074,156,1.0,5.0,56.0,76.0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,1
4,2146,182,0.0,5.0,37.0,45.0,1,0,0,1,...,0,0,0,1,0,0,0,0,0,1


In [198]:
df_m = df_main.drop(columns=part_hobbies.columns)

In [199]:
df_m.head()

Unnamed: 0,location_population,friends_number,sex,education,dob,daily_commute,In relationship,Married,Married with kids,Single,Mastercard,Revolut,Visa
0,37123,268,1.0,3.0,34.0,32.0,1,0,0,0,0,0,1
1,17191,204,0.0,3.0,36.0,5.0,1,0,0,0,1,0,0
2,61903,128,1.0,3.0,23.0,18.0,1,0,0,0,0,0,1
3,17074,156,1.0,5.0,56.0,76.0,0,0,1,0,0,0,1
4,2146,182,0.0,5.0,37.0,45.0,1,0,0,0,0,0,1


In [200]:
df_m.count()

location_population    2322
friends_number         2322
sex                    2322
education              2322
dob                    2322
daily_commute          2322
In relationship        2322
Married                2322
Married with kids      2322
Single                 2322
Mastercard             2322
Revolut                2322
Visa                   2322
dtype: int64

In [201]:
X_train, X_test, y_train, y_test = train_test_split(df_m, part_hobbies, test_size = 0.25)

Displaying the training set

In [202]:
y_train.head()

Unnamed: 0,Gym,Squash,Bodybuilding,Rowing,Cardio,Weightlifting,Badminton,Crossfit,Fitness,Stretching,Power bike,Kettleballs
266,0,0,0,0,0,0,0,0,1,0,0,0
2778,0,0,0,0,0,0,1,0,0,0,0,0
1424,0,0,0,0,0,0,1,0,0,0,0,0
434,0,0,0,0,0,0,0,0,1,0,0,0
26,0,0,0,0,0,0,0,0,0,0,1,0


In [203]:
y_train.columns

Index(['Gym', 'Squash', 'Bodybuilding', 'Rowing', 'Cardio', 'Weightlifting',
       'Badminton', 'Crossfit', 'Fitness', 'Stretching', 'Power bike',
       'Kettleballs'],
      dtype='object')

In [204]:
y_train.sum()

Gym              131
Squash           134
Bodybuilding     133
Rowing           132
Cardio           130
Weightlifting    118
Badminton        110
Crossfit         112
Fitness          109
Stretching       109
Power bike       112
Kettleballs       98
dtype: int64

In [205]:
# hot_credit_card_type_na = data_is_na['credit_card_type'].str.get_dummies()
# hot_credit_card_type_na = hot_credit_card_type_na.drop(hot_credit_card_type_na.columns[0], axis=1)

# hot_relationship_status_na = data_is_na['relationship_status'].str.get_dummies()
# hot_relationship_status_na = hot_relationship_status_na.drop(hot_relationship_status_na.columns[0], axis=1)

# # df_main_na = pd.concat([data_is_na, part_hobbies_na], axis=1)
# df_main_na = pd.concat([data_is_na, hot_relationship_status_na], axis=1)
# df_main_na = pd.concat([df_main_na, hot_credit_card_type_na], axis=1)

# df_main_na = df_main_na.drop(columns=['location', 'occupation', 'credit_card_type', 'hobbies', 'relationship_status'])

In [206]:
t_X_train = X_train[X_train.isna().any(axis=1)]
n_X_train = X_train.dropna()

def transform_function(df, n):
    hobby_columns = df['hobbies'].str.get_dummies(',') # Save hobbies column as one hot encorder
    hobb = hobby_columns.sum().sort_values(ascending=False)
    hobb_100 = hobb[hobb > 100]
    part_hobbies = hobby_columns[hobb_100.index]

    hot_credit_card_type = df['credit_card_type'].str.get_dummies() # Application of one hot encoding
    hot_credit_card_type = hot_credit_card_type.drop(hot_credit_card_type.columns[0], axis=1) # reduction in the number of columns

    hot_relationship_status = df['relationship_status'].str.get_dummies() # Application of one hot encoding
    hot_relationship_status = hot_relationship_status.drop(hot_relationship_status.columns[0], axis=1) # reduction in the number of columns

    df = pd.concat([df, part_hobbies], axis=1)
    df = pd.concat([df, hot_relationship_status], axis=1)
    df = pd.concat([df, hot_credit_card_type], axis=1)

    df = df.drop(columns=['location', 'occupation', 'credit_card_type', 'hobbies', 'relationship_status'])

    if n == True:
        df = df.drop(columns=part_hobbies.columns)

    return df

In [207]:
df_main_na.head()

Unnamed: 0,location_population,friends_number,sex,education,dob,daily_commute,In relationship,Married,Married with kids,Single,Mastercard,Revolut,Visa
10,20919,254,1.0,5.0,68.0,64.0,0,0,1,0,0,0,0
11,89590,341,1.0,5.0,25.0,38.0,0,0,0,1,0,0,1
12,89590,229,1.0,2.0,53.0,37.0,0,1,0,0,1,0,0
13,92090,244,0.0,5.0,27.0,21.0,1,0,0,0,0,0,1
34,168394,267,1.0,2.0,57.0,88.0,0,1,0,0,0,0,1


Training the model on the training group and then applying the model to the data for the group with barracking data. -> creating a df with the results  
Each column has its own Logistic regression model

In [208]:
n_X_train.count()

location_population    1741
friends_number         1741
sex                    1741
education              1741
dob                    1741
daily_commute          1741
In relationship        1741
Married                1741
Married with kids      1741
Single                 1741
Mastercard             1741
Revolut                1741
Visa                   1741
dtype: int64

In [209]:
trained_models = {} # Dict with models
new_df_m = pd.DataFrame() # df with predictions

for hobby in y_train.columns:
    model = LogisticRegression()
    model.fit(X_train, np.array(y_train[hobby].values))
    trained_models[hobby] = model # save trained models to dict
    pred = model.predict(df_main_na)
    new_df_m[hobby] = pred

In [210]:
trained_models

{'Gym': LogisticRegression(),
 'Squash': LogisticRegression(),
 'Bodybuilding': LogisticRegression(),
 'Rowing': LogisticRegression(),
 'Cardio': LogisticRegression(),
 'Weightlifting': LogisticRegression(),
 'Badminton': LogisticRegression(),
 'Crossfit': LogisticRegression(),
 'Fitness': LogisticRegression(),
 'Stretching': LogisticRegression(),
 'Power bike': LogisticRegression(),
 'Kettleballs': LogisticRegression()}

In [211]:
pred_hobby = pd.DataFrame()

for hobby, model in trained_models.items():
    pred = model.predict(n_X_train)
    pred_hobby[hobby] = pred

Saving models to a file in piccle format

In [212]:
pred_hobby.sum()

Gym              0
Squash           0
Bodybuilding     0
Rowing           0
Cardio           0
Weightlifting    0
Badminton        0
Crossfit         0
Fitness          0
Stretching       0
Power bike       0
Kettleballs      0
dtype: int64

In [213]:
t_X_train

Unnamed: 0,location_population,friends_number,sex,education,dob,daily_commute,In relationship,Married,Married with kids,Single,Mastercard,Revolut,Visa


In [214]:
pred_hobby_check = pd.DataFrame()

for hobby, model in trained_models.items():
    pred = model.predict(t_X_train)
    pred_hobby[hobby] = pred

ValueError: Found array with 0 sample(s) (shape=(0, 13)) while a minimum of 1 is required by LogisticRegression.

In [None]:
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_train, pred_hobby_check)
print("Prediction precision:", accuracy)

NameError: name 'pred_hobby_check' is not defined

In [None]:
with open('models/all_trained_hobby_models.pkl', 'wb') as f:
    pickle.dump(trained_models, f)


Sprawdzanie ilości wiersz w uzyskanym df

In [None]:
new_df_m.count()

Gym              478
Squash           478
Bodybuilding     478
Rowing           478
Cardio           478
Weightlifting    478
Badminton        478
Crossfit         478
Fitness          478
Stretching       478
Power bike       478
Kettleballs      478
dtype: int64

Niestety w uzyskanym df wszystkie wiersze mają wynik 0

In [None]:
new_df_m.sum()

Gym              1
Squash           0
Bodybuilding     0
Rowing           0
Cardio           0
Weightlifting    0
Badminton        0
Crossfit         0
Fitness          0
Stretching       0
Power bike       0
Kettleballs      0
dtype: int64

In [None]:
new_df_m.head()

Unnamed: 0,Gym,Squash,Bodybuilding,Rowing,Cardio,Weightlifting,Badminton,Crossfit,Fitness,Stretching,Power bike,Kettleballs
0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
df_main_na.head()

Unnamed: 0,location_population,friends_number,sex,education,dob,daily_commute,In relationship,Married,Married with kids,Single,Mastercard,Revolut,Visa
10,20919,254,1.0,5.0,68.0,64.0,0,0,1,0,0,0,0
11,89590,341,1.0,5.0,25.0,38.0,0,0,0,1,0,0,1
12,89590,229,1.0,2.0,53.0,37.0,0,1,0,0,1,0,0
13,92090,244,0.0,5.0,27.0,21.0,1,0,0,0,0,0,1
34,168394,267,1.0,2.0,57.0,88.0,0,1,0,0,0,0,1


In [None]:
df_pred = df_main_na.copy()
df_pred.reset_index(inplace=True)
df_pred[new_df_m.columns] = new_df_m
df_pred.head()

Unnamed: 0,index,location_population,friends_number,sex,education,dob,daily_commute,In relationship,Married,Married with kids,...,Bodybuilding,Rowing,Cardio,Weightlifting,Badminton,Crossfit,Fitness,Stretching,Power bike,Kettleballs
0,10,20919,254,1.0,5.0,68.0,64.0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,11,89590,341,1.0,5.0,25.0,38.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,12,89590,229,1.0,2.0,53.0,37.0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,13,92090,244,0.0,5.0,27.0,21.0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,34,168394,267,1.0,2.0,57.0,88.0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
df_pred_all = pd.concat([df_pred, df_main], axis=0)
df_pred_all.drop(columns=['index'], inplace=True)
df_pred_all

Unnamed: 0,location_population,friends_number,sex,education,dob,daily_commute,In relationship,Married,Married with kids,Single,...,Bodybuilding,Rowing,Cardio,Weightlifting,Badminton,Crossfit,Fitness,Stretching,Power bike,Kettleballs
0,20919,254,1.0,5.0,68.000000,64.0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,89590,341,1.0,5.0,25.000000,38.0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,89590,229,1.0,2.0,53.000000,37.0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,92090,244,0.0,5.0,27.000000,21.0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,168394,267,1.0,2.0,57.000000,88.0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2794,403883,254,0.0,3.0,51.000000,81.0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2796,111752,178,0.0,5.0,70.000000,90.0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2797,109021,226,0.0,3.0,36.000000,62.0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2798,638586,216,0.0,3.0,45.265734,91.0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
