In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing, svm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
import pickle

In [2]:
# Importing a dataset
data =  pd.read_csv('data/for_model.csv', index_col = [0])
y = pd.read_csv("data/for_model_y.csv", index_col = [0])

In [3]:
data.head()

Unnamed: 0,location,location_population,occupation,hobbies,friends_number,sex,credit_card_type,relationship_status,education,dob,daily_commute
0,Chrzanów,37123,Numerical and material recording clerks,"Stretching,Cooking,Parkour",268,1.0,Visa,In relationship,3.0,34.0,32.0
1,Konstancin-Jeziorna,17191,Production and specialised services managers,"Squash,Cardio,Astronomy",204,0.0,Mastercard,In relationship,3.0,36.0,5.0
2,Stalowa Wola,61903,Personal care workers,"Cardio,Power bike,Wood carving,Acting,Rappelling",128,1.0,Visa,In relationship,3.0,23.0,18.0
3,Braniewo,17074,"Armed forces occupations, other ranks","Bodybuilding,Fitness,Skimboarding",156,1.0,Visa,Married with kids,5.0,56.0,76.0
4,Sieniawa,2146,General and keyboard clerks,"Cardio,Rowing,Gym,Dowsing",182,0.0,Visa,In relationship,5.0,37.0,45.0


We can see that the data is less because we are only working on training data

In [4]:
NA = pd.DataFrame(data=[data.notna().sum().tolist(),
            data.isna().sum().tolist(), ["{:.2f}".format(i)+'%' \
           for i in (data.isna().sum()/data.shape[0]*100).tolist()]], 
           columns=data.columns, index=['Not NA Count', 'NA Count', 'NA Percent']).transpose()
NA

Unnamed: 0,Not NA Count,NA Count,NA Percent
location,2800,0,0.00%
location_population,2800,0,0.00%
occupation,2800,0,0.00%
hobbies,2322,478,17.07%
friends_number,2800,0,0.00%
sex,2800,0,0.00%
credit_card_type,2800,0,0.00%
relationship_status,2800,0,0.00%
education,2800,0,0.00%
dob,2800,0,0.00%


Data that does not have missing values

In [5]:
data_no_na = data.dropna()
data_no_na.count()

location               2322
location_population    2322
occupation             2322
hobbies                2322
friends_number         2322
sex                    2322
credit_card_type       2322
relationship_status    2322
education              2322
dob                    2322
daily_commute          2322
dtype: int64

In [6]:
data_is_na = data[data.isna().any(axis=1)]
data_is_na

Unnamed: 0,location,location_population,occupation,hobbies,friends_number,sex,credit_card_type,relationship_status,education,dob,daily_commute
10,Hajnówka,20919,"Armed forces occupations, other ranks",,254,1.0,American Express,Married with kids,5.0,68.0,64.000000
11,Jastrzębie-Zdrój,89590,"Legal, social and cultural professionals",,341,1.0,Visa,Single,5.0,25.0,38.000000
12,Jastrzębie-Zdrój,89590,"Agricultural, forestry and fishery labourers",,229,1.0,Mastercard,Married,2.0,53.0,37.000000
13,Jaworzno,92090,Commissioned armed forces officers,,244,0.0,Visa,In relationship,5.0,27.0,21.000000
34,Bytom,168394,Other clerical support workers,,267,1.0,Visa,Married,2.0,57.0,88.000000
...,...,...,...,...,...,...,...,...,...,...,...
2760,Warszawa,1764615,"Legal, social, cultural and related associate ...",,188,0.0,Visa,Single,2.0,22.0,5.000000
2776,Włocławek,111752,"Food processing, wood working, garment and oth...",,307,1.0,Mastercard,In relationship,3.0,49.0,50.704293
2782,Tarnów,109650,Non-commissioned armed forces officers,,296,1.0,Visa,Divorced,4.0,78.0,46.838235
2787,Kraków,767348,Market-oriented skilled agricultural workers,,213,1.0,Mastercard,Married,1.0,68.0,50.000000


In [7]:
hobby_columns = data_no_na['hobbies'].str.get_dummies(',') # Save hobbies column as one hot encorder
hobby_columns.head()

Unnamed: 0,3D printing,Acting,Air sports,Amateur radio,Archery,Astronomy,BASE jumping,Backpacking,Badminton,Baseball,...,Wood carving,Woodworking,Worldbuilding,Writing,Yo-yoing,Yoga,amateur radio,role-playing games,scrapbook,tabletop games
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


There are 174 different hobbies in the data we have

We examine what hobbies occur most frequently

In [8]:
hobb = hobby_columns.sum().sort_values(ascending=False)
hobb.head(20)

Gym                 183
Squash              183
Bodybuilding        178
Rowing              171
Cardio              161
Weightlifting       158
Badminton           149
Crossfit            147
Fitness             145
Stretching          143
Power bike          143
Kettleballs         135
Parkour              40
Poi                  39
Sculpting            39
Skydiving            36
Drama                36
Book restoration     36
Taxidermy            35
Geocaching           35
dtype: int64

We only choose the hobbies that are most common (greater than 100)

In [9]:
hobb_100 = hobb[hobb > 100]

In [10]:
part_hobbies = hobby_columns[hobb_100.index]
part_hobbies.head()

Unnamed: 0,Gym,Squash,Bodybuilding,Rowing,Cardio,Weightlifting,Badminton,Crossfit,Fitness,Stretching,Power bike,Kettleballs
0,0,0,0,0,0,0,0,0,0,1,0,0
1,0,1,0,0,1,0,0,0,0,0,0,0
2,0,0,0,0,1,0,0,0,0,0,1,0
3,0,0,1,0,0,0,0,0,1,0,0,0
4,1,0,0,1,1,0,0,0,0,0,0,0


In [11]:
# import pandas as pd

# data = {
#     'Gym': [0, 0, 0, 0, 1],
#     'Squash': [0, 1, 0, 0, 0],
#     'Bodybuilding': [0, 0, 0, 1, 0],
#     'Rowing': [0, 0, 0, 0, 1],
#     'Cardio': [0, 1, 1, 0, 1],
#     'Weightlifting': [0, 0, 0, 0, 0],
#     'Badminton': [0, 0, 0, 0, 0],
#     'Crossfit': [0, 0, 0, 0, 0],
#     'Fitness': [0, 0, 0, 1, 0],
#     'Stretching': [0, 0, 0, 0, 0],
#     'Power bike': [1, 0, 0, 0, 0],
#     'Kettlebells': [0, 0, 1, 0, 0]
# }

# df = pd.DataFrame(data)

# exercise_columns = df.columns

# # Calculate numeric values for each row
# df['NumericValue'] = df.apply(lambda row: sum(2**i for i, val in enumerate(row) if val), axis=1)

# print(df)

In [12]:
df = pd.DataFrame(data)

Divide dataset into training and test data 

In [13]:
X_train, X_test, y_train, y_test = train_test_split(data_no_na, part_hobbies, test_size = 0.25, random_state=1)

Displaying the training set

In [14]:
y_train.head()

Unnamed: 0,Gym,Squash,Bodybuilding,Rowing,Cardio,Weightlifting,Badminton,Crossfit,Fitness,Stretching,Power bike,Kettleballs
1044,0,0,0,0,0,0,0,0,0,0,0,1
1291,1,0,0,0,1,0,0,0,0,0,0,0
671,0,0,1,0,0,0,0,0,0,0,0,0
1094,0,0,0,0,0,0,0,0,0,0,0,0
885,0,0,0,0,0,0,0,0,0,0,0,0


In [15]:
y_train.columns

Index(['Gym', 'Squash', 'Bodybuilding', 'Rowing', 'Cardio', 'Weightlifting',
       'Badminton', 'Crossfit', 'Fitness', 'Stretching', 'Power bike',
       'Kettleballs'],
      dtype='object')

In [16]:
y_train.sum()

Gym              130
Squash           129
Bodybuilding     137
Rowing           133
Cardio           109
Weightlifting    130
Badminton        112
Crossfit         100
Fitness          101
Stretching       113
Power bike       112
Kettleballs      105
dtype: int64

Jedna funkcja jedna rzecz

In [17]:
def hobbies_100(df):
    hobby_columns = df['hobbies'].str.get_dummies(',') # Save hobbies column as one hot encorder
    hobb = hobby_columns.sum().sort_values(ascending=False) # Sorting from largest to smallest
    hobb_100 = hobb[hobb > 100] # Choosing hobbies that repeat 100 or more times
    return hobby_columns[hobb_100.index] # saving the selected hobbies

In [18]:
testss = hobbies_100(df)

In [19]:
testss.columns

Index(['Gym', 'Squash', 'Bodybuilding', 'Rowing', 'Cardio', 'Weightlifting',
       'Badminton', 'Crossfit', 'Fitness', 'Stretching', 'Power bike',
       'Kettleballs'],
      dtype='object')

In [20]:
# testss['sum'] = testss[testss.columns].sum(axis=1)
testss


Unnamed: 0,Gym,Squash,Bodybuilding,Rowing,Cardio,Weightlifting,Badminton,Crossfit,Fitness,Stretching,Power bike,Kettleballs
0,0,0,0,0,0,0,0,0,0,1,0,0
1,0,1,0,0,1,0,0,0,0,0,0,0
2,0,0,0,0,1,0,0,0,0,0,1,0
3,0,0,1,0,0,0,0,0,1,0,0,0
4,1,0,0,1,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
2795,0,0,0,0,0,0,0,0,0,0,0,0
2796,0,0,0,0,0,0,0,0,0,0,0,0
2797,0,0,0,0,0,0,0,0,0,0,0,0
2798,0,0,0,0,0,0,0,0,0,0,0,0


In [21]:
# # Wyszukaj kolumnę z wartością 1 w wierszu 1 (drugim wierszu)
# row_index = 1  # Indeks wiersza (numerowany od zera)
# column_with_value_1 = transformed_df.iloc[row_index, :-1].idxmax()

# print("Kolumna z wartością 1 w wierszu 1:", column_with_value_1)


In [22]:
# df.columns.get_loc(column_name)

In [23]:
testss

Unnamed: 0,Gym,Squash,Bodybuilding,Rowing,Cardio,Weightlifting,Badminton,Crossfit,Fitness,Stretching,Power bike,Kettleballs
0,0,0,0,0,0,0,0,0,0,1,0,0
1,0,1,0,0,1,0,0,0,0,0,0,0
2,0,0,0,0,1,0,0,0,0,0,1,0
3,0,0,1,0,0,0,0,0,1,0,0,0
4,1,0,0,1,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
2795,0,0,0,0,0,0,0,0,0,0,0,0
2796,0,0,0,0,0,0,0,0,0,0,0,0
2797,0,0,0,0,0,0,0,0,0,0,0,0
2798,0,0,0,0,0,0,0,0,0,0,0,0


In [24]:
testss.loc[1,:].sum()

2

In [25]:
import numpy as np

encoded_matrix = np.array([[0, 1, 0, 0],
                           [1, 0, 0, 0],
                           [0, 0, 0, 1]])

original_labels = testss.index + 2

decoded_labels = [original_labels[np.where(row == 1)[0][0]] for row in encoded_matrix]

print(decoded_labels)  # Wyświetli: ['B', 'A', 'D']


[3, 2, 5]


In [26]:
# original_labels = testss.index + 2
# original_labels[np.where(row == 1)[0][0]]

In [34]:
values = [0, 0, 0, 0, 0]

indices_with_one = [index for index, value in enumerate(values) if value == 1 else -1]

print(indices_with_one[0])


SyntaxError: invalid syntax (3738665058.py, line 3)

In [37]:
    values = row[1]

    indices_with_one = [index for index, value in enumerate(values) if value == 1]

    if not indices_with_one:
        result = 100
    elif len(indices_with_one) > 1:
        result = 200
    else:
        result = indices_with_one[0]

print(result)


3


In [45]:
testss

Unnamed: 0,Gym,Squash,Bodybuilding,Rowing,Cardio,Weightlifting,Badminton,Crossfit,Fitness,Stretching,Power bike,Kettleballs
0,0,0,0,0,0,0,0,0,0,1,0,0
1,0,1,0,0,1,0,0,0,0,0,0,0
2,0,0,0,0,1,0,0,0,0,0,1,0
3,0,0,1,0,0,0,0,0,1,0,0,0
4,1,0,0,1,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
2795,0,0,0,0,0,0,0,0,0,0,0,0
2796,0,0,0,0,0,0,0,0,0,0,0,0
2797,0,0,0,0,0,0,0,0,0,0,0,0
2798,0,0,0,0,0,0,0,0,0,0,0,0


In [43]:
def calculate(row):
    values = row

    indices_with_one = [index for index, value in enumerate(values) if value == 1]

    if not indices_with_one:
        return 0
    elif len(indices_with_one) > 1:
        return 14
    else:
        return indices_with_one[0] + 2

In [47]:
testss['new_col'] = testss.apply(calculate, axis=1).values
testss

Unnamed: 0,Gym,Squash,Bodybuilding,Rowing,Cardio,Weightlifting,Badminton,Crossfit,Fitness,Stretching,Power bike,Kettleballs,new_col
0,0,0,0,0,0,0,0,0,0,1,0,0,11
1,0,1,0,0,1,0,0,0,0,0,0,0,14
2,0,0,0,0,1,0,0,0,0,0,1,0,14
3,0,0,1,0,0,0,0,0,1,0,0,0,14
4,1,0,0,1,1,0,0,0,0,0,0,0,14
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2795,0,0,0,0,0,0,0,0,0,0,0,0,0
2796,0,0,0,0,0,0,0,0,0,0,0,0,0
2797,0,0,0,0,0,0,0,0,0,0,0,0,0
2798,0,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
# def calculate_value(row):
#     total_sum = row.sum()
#     if total_sum == 0:
#         return 1
#     elif total_sum > 1:
#         return 14
#     else:
#         return (row[row == 1].index + 2)
        

def calculate_value(row):
    # print(row[1])
    total_sum = row.sum()
    if total_sum == 0:
        return 1
    elif total_sum > 1:
        return 14
    else:
        indices_with_one = [index for index, value in enumerate([row[1]]) if value == 1]
        return indices_with_one[0]


tos = testss.copy()   
tos['new_column'] = testss.apply(calculate_value, axis=1)
tos

In [None]:
import pandas as pd

# Assuming 'series' is your Pandas Series
series = pd.Series([0, 1, 0, 1, 0, 1])

# Find elements with value 1
elements_with_value_1 = series[series == 1].index + 2

print("Elements with value 1:")
print(elements_with_value_1)

In [None]:
import pandas as pd

def transform_table(df):
    def calculate_value(row):
        total_sum = row.sum()
        if total_sum == 0:
            return 1
        elif total_sum == 1:
            return row.idxmax() + 1
        else:
            return 14

    df['new_column'] = df.apply(calculate_value, axis=1)
    return df[['new_column']]

# Przykładowa tabela
data = {
    'Gym': [0, 0, 0, 0, 1],
    'Squash': [0, 1, 0, 0, 0],
    'Bodybuilding': [0, 0, 0, 1, 0],
    'Rowing': [0, 0, 0, 0, 1],
    'Cardio': [0, 1, 1, 0, 1],
    'Weightlifting': [0, 0, 0, 0, 0],
    'Badminton': [0, 0, 0, 0, 0],
    'Crossfit': [0, 0, 0, 0, 0],
    'Fitness': [0, 0, 0, 1, 0],
    'Stretching': [1, 0, 0, 0, 0],
    'Power bike': [0, 0, 1, 0, 0],
    'Kettleballs': [0, 0, 0, 0, 0],
    'sum': [1, 2, 2, 2, 3]
}

df = pd.DataFrame(data)

transformed_df = transform_table(df)
print(transformed_df)


In [None]:
testss[testss['sum'] == 1]

In [None]:
def one_hot_column(df, col):
    hot_col = df[col].str.get_dummies() # Application of one hot encoding
    return hot_col.drop(hot_col.columns[0], axis=1) # reduction in the number of columns

In [None]:
def t_function(df):
    hot_credit_card_type = one_hot_column(df,'credit_card_type')
    hot_relationship_status = one_hot_column(df,'relationship_status')

    # Linking tables with data as one hot encoder
    df = pd.concat([df, hot_relationship_status], axis=1)
    df = pd.concat([df, hot_credit_card_type], axis=1)

    df = df.drop(columns=['location', 'occupation', 'credit_card_type', 'hobbies', 'relationship_status']) # Deleting columns
    return df

In [None]:
# def transform_function(df, n):
#     hobby_columns = df['hobbies'].str.get_dummies(',') # Save hobbies column as one hot encorder
#     hobb = hobby_columns.sum().sort_values(ascending=False) # Sorting from largest to smallest
#     hobb_100 = hobb[hobb > 100] # Choosing hobbies that repeat 100 or more times
#     part_hobbies = hobby_columns[hobb_100.index] # saving the selected hobbies

#     hot_credit_card_type = df['credit_card_type'].str.get_dummies() # Application of one hot encoding
#     hot_credit_card_type = hot_credit_card_type.drop(hot_credit_card_type.columns[0], axis=1) # reduction in the number of columns

#     hot_relationship_status = df['relationship_status'].str.get_dummies() # Application of one hot encoding
#     hot_relationship_status = hot_relationship_status.drop(hot_relationship_status.columns[0], axis=1) # reduction in the number of columns

#     # Conditional addition of hobby columns 
#     if n == True:
#         pass
#     else:
#         df = pd.concat([df, part_hobbies], axis=1)
        
#     # Linking tables with data as one hot encoder
#     df = pd.concat([df, hot_relationship_status], axis=1)
#     df = pd.concat([df, hot_credit_card_type], axis=1)

#     df = df.drop(columns=['location', 'occupation', 'credit_card_type', 'hobbies', 'relationship_status']) # Deleting columns

#     return df

In [None]:
X_train.head()

In [None]:
# X_train_trans = transform_function(X_train, True)
X_train_trans = t_function(X_train)

In [None]:
X_train_trans.head()

Training the model on the training group and then applying the model to the data for the group with barracking data. -> creating a df with the results  
Each column has its own Logistic regression model

Zmienne ilościowe, wartości odstające - sprawdzić skośność 

In [None]:
X_train_trans[['location_population','friends_number', 'dob', 'daily_commute']].skew()

X_train_trans.describe()

In [None]:
X_train_trans['location_population'].max()

In [None]:
X_train_trans['location_population'].mean()

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Przykładowe dane - zakładam, że masz już załadowane dane w zmiennej X_train_trans
# X_train_trans = ...

# Wybieram tylko kolumny do analizy
columns_to_analyze = ['location_population', 'friends_number', 'dob', 'daily_commute']
data_to_analyze = X_train_trans[columns_to_analyze]

# Tworzę wykresy
plt.figure(figsize=(12, 6))

# Tworzenie histogramów dla każdej zmiennej
for i, column in enumerate(columns_to_analyze, 1):
    plt.subplot(2, 2, i)
    sns.histplot(data_to_analyze[column], kde=True)
    plt.title(f'Histogram: {column}')

plt.subplots_adjust(hspace=0.5, wspace=0.3)

# Tworzenie wykresu skośności (skewness)
skewness_values = data_to_analyze.skew()
plt.figure(figsize=(8, 4))
sns.barplot(x=skewness_values.index, y=skewness_values.values)
plt.title('Data skewness')
plt.xticks(rotation=45)
plt.ylabel('Skewness value')

plt.tight_layout()
plt.show()


In [None]:
X_train_trans.head()

Training the models  

Each hobby for which we want to get a forecast has its own model 

y_train - zamineić na jedną kolumnę 

In [None]:
y_train.columns

dodać RFE

In [None]:
trained_models = {} # Dict with models


for hobby in y_train.columns:
    model = LogisticRegression()
    model.fit(X_train_trans, np.array(y_train[hobby].values))
    trained_models[hobby] = model # save trained models to dict

In [None]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
import numpy as np

trained_models = {}  # Dict with models

# Assuming you have X_train_trans and y_train defined
# X_train_trans is your feature matrix
# y_train is your target variable dataframe

for hobby in y_train.columns:
    model = LogisticRegression()
    model.fit(X_train_trans, np.array(y_train[hobby].values))
    trained_models[hobby] = model  # save trained models to dict

def apply_rfe(trained_models, X_train, num_features_to_select=5):
    selected_features = {}

    for hobby, model in trained_models.items():
        rfe = RFE(model, n_features_to_select=num_features_to_select)
        rfe.fit(X_train, np.array(y_train[hobby].values))
        selected_features[hobby] = {
            "selected_features": X_train.columns[rfe.support_],
            "ranking": rfe.ranking_
        }

    return selected_features

# Call the function and pass your X_train_trans
selected_features = apply_rfe(trained_models, X_train_trans)

# Now you can access the selected features and their rankings for each hobby
for hobby, features_info in selected_features.items():
    print(f"Hobby: {hobby}")
    print("Selected Features:", features_info['selected_features'])
    print("Feature Ranking:", features_info['ranking'])
    print()


Application of trained models

In [None]:
selected_features

In [None]:
pred_hobby_train = pd.DataFrame()

for hobby, model in trained_models.items():
    pred = model.predict(X_train_trans)
    pred_hobby_train[hobby] = pred


Dodać drzewa decyzyjne, XGBoost

Przewidywania tragiczne 

In [None]:
pred_hobby_train.sum()

Czemu wynik jest tak wysoki??

## Checking the results for the training set

In [None]:
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_train, pred_hobby_train)
print("Prediction precision:", accuracy)


## Checking the results for the test set

In [None]:
# X_test_trans = transform_function(X_test, True)
X_test_trans = t_function(X_test)

In [None]:
pred_hobby_test = pd.DataFrame()

for hobby, model in trained_models.items():
    pred = model.predict(X_test_trans)
    pred_hobby_test[hobby] = pred

In [None]:
accuracy = accuracy_score(y_test, pred_hobby_test)
print("Prediction precision:", accuracy)

Saving models to a file in piccle format

In [None]:
with open('models/all_trained_hobby_models.pkl', 'wb') as f:
    pickle.dump(trained_models, f)
