In [1]:
import pickle
from datetime import datetime
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

%matplotlib inline

In [2]:
# Read the raw dataset from the Excel workbook into a DataFrame.
data = pd.read_excel('data/GymSubs.xlsx')

In [3]:
# Preview the first five rows of the raw data.
data.head(n=5)

Unnamed: 0,user_id,target,name,sex,dob,location,location_population,occupation,hobbies,daily_commute,friends_number,relationship_status,education,credit_card_type
0,0,1,Halina,,1982-08-07,Piastów,22732,Teaching professionals,Fitness,46.0,196,Single,,Visa
1,1,0,Eustachy,male,1971-10-28,Sokółka,18331,General and keyboard clerks,"LARPing,Foreign language learning,Netball",55.0,243,Single,,
2,2,1,Egon,,2000-07-05,Łaskarzew,4879,Protective services workers,"Bodybuilding,Kabaddi",90.0,191,In relationship,3.0,
3,3,0,Eulalia,female,1992-06-10,Bydgoszcz,352313,Customer services clerks,Badminton,88.0,164,In relationship,3.0,Visa
4,4,0,Hilary,,1975-01-09,Osieczna,2322,Refuse workers and other elementary workers,"Fitness,Embroidery,Lacemaking",40.0,119,Married with kids,5.0,


In [4]:
# Per-column missing-value summary of the raw data:
# count of present values, count of missing values, and missing share in percent.
present_counts = data.notna().sum().tolist()
missing_counts = data.isna().sum().tolist()
missing_percent = [
    f"{pct:.2f}%"
    for pct in (data.isna().sum() / data.shape[0] * 100).tolist()
]

# Built row-wise and then transposed, so each original column becomes one row of the summary.
NA = pd.DataFrame(
    data=[present_counts, missing_counts, missing_percent],
    columns=data.columns,
    index=['Not NA Count', 'NA Count', 'NA Percent'],
).transpose()
NA

Unnamed: 0,Not NA Count,NA Count,NA Percent
user_id,4000,0,0.00%
target,4000,0,0.00%
name,3982,18,0.45%
sex,3616,384,9.60%
dob,3606,394,9.85%
location,4000,0,0.00%
location_population,4000,0,0.00%
occupation,4000,0,0.00%
hobbies,3320,680,17.00%
daily_commute,3595,405,10.12%


In [5]:
# Mapping used to turn the textual 'sex' column into numeric codes.
# Values that are not keys of this dict (including missing ones) map to NaN.
mapping = {'male': 0, 'female': 1}

# Build the working frame from the raw data without modifying `data`:
#   * drop the 'user_id' and 'name' columns,
#   * recode 'sex' through `mapping`,
#   * replace 'dob' with (current calendar year - birth year); the column keeps its name.
# NOTE(review): the 'dob' value depends on the clock at run time, so re-running the
# notebook in a later year yields different numbers - confirm this is intended.
df = (
    data
    .drop(columns=['user_id', 'name'])
    .assign(
        sex=lambda frame: frame['sex'].map(mapping),
        dob=lambda frame: datetime.now().year - frame['dob'].dt.year,
    )
)

In [6]:
# Keep only the rows that have no missing value in any remaining column.
df = df.dropna()

In [7]:
# Recompute the missing-value summary on the cleaned frame to confirm no NaNs remain.
filled_per_column = df.notna().sum().tolist()
empty_per_column = df.isna().sum().tolist()
empty_share = [
    f"{share:.2f}%"
    for share in (df.isna().sum() / df.shape[0] * 100).tolist()
]

# Rows are assembled first and transposed afterwards: one summary row per column of `df`.
NA = pd.DataFrame(
    data=[filled_per_column, empty_per_column, empty_share],
    columns=df.columns,
    index=['Not NA Count', 'NA Count', 'NA Percent'],
).transpose()
NA

Unnamed: 0,Not NA Count,NA Count,NA Percent
target,1734,0,0.00%
sex,1734,0,0.00%
dob,1734,0,0.00%
location,1734,0,0.00%
location_population,1734,0,0.00%
occupation,1734,0,0.00%
hobbies,1734,0,0.00%
daily_commute,1734,0,0.00%
friends_number,1734,0,0.00%
relationship_status,1734,0,0.00%


In [8]:
# One-hot encode the comma-separated 'hobbies' strings: one 0/1 indicator column per hobby.
hobby_columns = df['hobbies'].str.get_dummies(sep=',')

# Append the indicator columns to the frame, then remove the original text column.
df_encoded = pd.concat([df, hobby_columns], axis=1)
df_encoded = df_encoded.drop(columns='hobbies')

# Show the first rows of the encoded frame.
df_encoded.head()

Unnamed: 0,target,sex,dob,location,location_population,occupation,daily_commute,friends_number,relationship_status,education,...,Wood carving,Woodworking,Worldbuilding,Writing,Yo-yoing,Yoga,amateur radio,role-playing games,scrapbook,tabletop games
3,0,1.0,31.0,Bydgoszcz,352313,Customer services clerks,88.0,164,In relationship,3.0,...,0,0,0,0,0,0,0,0,0,0
7,0,0.0,45.0,Dobra,2353,Administrative and commercial managers,74.0,224,In relationship,4.0,...,0,0,0,0,0,0,0,0,0,0
12,0,0.0,47.0,Rzeszów,189662,Drivers and mobile plant operators,32.0,229,In relationship,5.0,...,0,0,0,0,0,0,0,0,0,0
14,0,1.0,56.0,Opole,128140,Health associate professionals,48.0,326,In relationship,1.0,...,0,0,0,0,0,0,0,0,0,0
20,0,1.0,33.0,Zawadzkie,7250,Information and communications technicians,6.0,163,Married,6.0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# Dummy-encode the two remaining categorical columns as integer 0/1 indicators.
# drop_first=True omits the first level of each column.
categorical_cols = ['relationship_status', 'credit_card_type']
df_card = pd.get_dummies(df_encoded[categorical_cols], drop_first=True).astype(int)

# Replace the original categorical columns with their indicator columns.
# NOTE(review): this overwrites `df_encoded`, so running the cell a second time without
# re-running the previous one raises a KeyError for the already-removed columns.
df_encoded = pd.concat([df_encoded.drop(columns=categorical_cols), df_card], axis=1)

In [10]:
# Remove the free-text 'location' and 'occupation' columns; they are not encoded here.
df_encoded = df_encoded.drop(columns=['location', 'occupation'])

In [11]:
# Save the encoded frame to CSV; the row index is written as well (pandas default).
df_encoded.to_csv(path_or_buf='data/df_encodet.csv')

In [12]:
# Single seed shared by the train/test split and the forest, so that a
# Restart & Run All reproduces the same fitted model.
RANDOM_STATE = 42

# Split the data into features and labels: the model predicts 'sex'
# from every other column of the encoded frame.
features = df_encoded.drop(['sex'], axis=1)
labels = df_encoded['sex']

# Hold out 20% of the rows as a test set.
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=RANDOM_STATE
)

# Random forests are stochastic; without random_state every run would produce
# a different model even though the split itself is seeded.
model = RandomForestClassifier(random_state=RANDOM_STATE)

# Fit on the training split. X_test / y_test are not used in this cell.
model.fit(X_train, y_train)

In [13]:
# Persist the fitted `model` so it can be reloaded later without retraining.
model_path = Path('models/model_sex.pkl')

# Create the target directory when it is missing; opening the file for writing
# would otherwise raise FileNotFoundError on a fresh checkout.
model_path.parent.mkdir(parents=True, exist_ok=True)

# Write the model to the file.
with model_path.open('wb') as file:
    pickle.dump(model, file)
