## A self-made logistic Lasso model was implemented to solve the task of Avito School of Analysts
# (without explanation)

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

In [None]:
class LassoLogisticRegression:

    def __init__(self, max_iter=5e3, lr=0.04, tol=0.001, l1_coef=0.1):

        self.max_iter = max_iter
        self.lr = lr
        self.tol = tol
        self.l1_coef = l1_coef
        self.weights = None
        self.bias = None

    def fit(self, X_train, y_train):


        n, m = X_train.shape

        self.weights = np.zeros((m, 1))
        self.bias = np.mean(y_train)

        n_iter = 0
        gradient_norm = np.inf

        while n_iter < self.max_iter and gradient_norm > self.tol:

            dJdw, dJdb = self.grads(X_train, y_train)

            gradient_norm = np.linalg.norm(np.hstack([dJdw.flatten(), [dJdb]]))

            self.weights = self.weights - self.lr * dJdw
            self.bias = self.bias - self.lr * dJdb

            n_iter += 1

        return self

    def predict(self, X):
        return list(map(lambda x: int(x[0] > 0.5764479042420024), self.predict_proba(X)))


    def predict_proba(self, X):
        return LassoLogisticRegression.sigmoid(np.dot(X, self.weights) + self.bias)

    def grads(self, X, y):

        y_hat = self.predict_proba(X)

        dJdw = np.mean(X * (y_hat - y) + self.l1_coef * np.sign(self.weights.T), axis=0, keepdims=True).T
        dJdb = np.mean(y_hat - y)

        return dJdw, dJdb

    @staticmethod
    def sigmoid(x):
        return 1 / (1 + np.exp(-x))


def accuracy(y_true, y_pred):
    correct_predictions = np.sum(y_true == y_pred)
    total_predictions = len(y_true)

    return correct_predictions / total_predictions

In [None]:
train_data = pd.read_csv("/content/binary_clf_data.csv")
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8923 entries, 0 to 8922
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   gender              8923 non-null   object 
 1   user_id             8923 non-null   int64  
 2   category_id         8923 non-null   int64  
 3   category_name       8923 non-null   object 
 4   subcategory_id      8923 non-null   int64  
 5   subcategory_name    8923 non-null   object 
 6   param1              8570 non-null   object 
 7   param2              3395 non-null   object 
 8   param3              2119 non-null   object 
 9   param1_microcat_id  8570 non-null   float64
 10  param2_microcat_id  3395 non-null   float64
 11  param3_microcat_id  2119 non-null   float64
dtypes: float64(3), int64(3), object(6)
memory usage: 836.7+ KB


In [None]:
train_data['param3'].nunique()

214

In [None]:
test_data = pd.read_csv('/content/dataset_527992_9.txt')
test_data

Unnamed: 0,user_id,category_id,category_name,subcategory_id,subcategory_name,param1,param2,param3,param1_microcat_id,param2_microcat_id,param3_microcat_id
0,154189609,7,Хобби и отдых,4,Велосипеды,Детские,,,1246.0,,
1,154189609,17,Бытовая электроника,29,Товары для компьютера,Комплектующие,Видеокарты,,1651.0,6491.0,
2,154189609,17,Бытовая электроника,41,Телефоны,iPhone,,,7315.0,,
3,154189609,17,Бытовая электроника,41,Телефоны,Xiaomi,,,10750009.0,,
4,154189728,28,Личные вещи,22,"Одежда, обувь, аксессуары",Женская одежда,Верхняя одежда,46–48 (L),1989.0,6949.0,775.0
...,...,...,...,...,...,...,...,...,...,...,...
3185,154263224,250003,Работа,250004,Резюме,"Без опыта, студенты",,,2179.0,,
3186,154265872,28,Личные вещи,47,Товары для детей и игрушки,Игрушки,,,5428.0,,
3187,154265872,28,Личные вещи,47,Товары для детей и игрушки,Игрушки,,,5428.0,,
3188,154265872,28,Личные вещи,47,Товары для детей и игрушки,Игрушки,,,5428.0,,


In [None]:
train_data['is_train'] = 1
test_data['is_train'] = 0
combined_data = pd.concat([train_data, test_data], axis=0)
combined_data

Unnamed: 0,gender,user_id,category_id,category_name,subcategory_id,subcategory_name,param1,param2,param3,param1_microcat_id,param2_microcat_id,param3_microcat_id,is_train
0,male,165823598,1,Для дома и дачи,50,Ремонт и строительство,Стройматериалы,,,981.0,,,1
1,male,165823598,1,Для дома и дачи,50,Ремонт и строительство,Стройматериалы,,,981.0,,,1
2,male,165823598,28,Личные вещи,22,"Одежда, обувь, аксессуары",Мужская одежда,Обувь,45,3285.0,6813.0,6074.0,1
3,male,165823598,1,Для дома и дачи,38,Мебель и интерьер,Шкафы и комоды,,,4533.0,,,1
4,male,154189396,1,Для дома и дачи,38,Мебель и интерьер,Кухонные гарнитуры,,,7506.0,,,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3185,,154263224,250003,Работа,250004,Резюме,"Без опыта, студенты",,,2179.0,,,0
3186,,154265872,28,Личные вещи,47,Товары для детей и игрушки,Игрушки,,,5428.0,,,0
3187,,154265872,28,Личные вещи,47,Товары для детей и игрушки,Игрушки,,,5428.0,,,0
3188,,154265872,28,Личные вещи,47,Товары для детей и игрушки,Игрушки,,,5428.0,,,0


In [None]:
interim_data = combined_data.drop(columns = ['category_id', 'subcategory_id', 'param1_microcat_id', 'param2_microcat_id', 'param3_microcat_id'])
interim_data = pd.get_dummies(data = interim_data, columns= ['category_name', 'subcategory_name','param1','param2', 'param3'])
ids = interim_data['user_id']
gender = interim_data['gender']
ones_count = (interim_data==1).sum()
columns_to_keep = ones_count[ones_count >= 10].index
interim_data = interim_data[columns_to_keep]
interim_data['user_id'] = ids
interim_data['gender'] = gender
interim_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12113 entries, 0 to 3189
Columns: 346 entries, is_train to gender
dtypes: bool(343), int64(2), object(1)
memory usage: 4.3+ MB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  interim_data['user_id'] = ids
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  interim_data['gender'] = gender


In [None]:
train_data_encoded = interim_data[interim_data['is_train'] == 1].drop(columns=['is_train'])
test_data_encoded = interim_data[interim_data['is_train'] == 0].drop(columns=['is_train'])
train_data_encoded.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8923 entries, 0 to 8922
Columns: 345 entries, category_name_Бытовая электроника to gender
dtypes: bool(343), int64(1), object(1)
memory usage: 3.1+ MB


In [None]:
gender_aggregated = train_data_encoded.groupby('user_id')['gender'].first()
gender_aggregated = gender_aggregated.map({'male': 1, 'female': 0})
grouped_data = train_data_encoded.groupby(by = 'user_id').sum(numeric_only = bool)

In [None]:
grouped_data['gender'] = gender_aggregated
grouped_data = grouped_data.reset_index()
print(grouped_data.shape)

(1916, 345)


In [None]:
grouped_data

Unnamed: 0,user_id,category_name_Бытовая электроника,category_name_Для бизнеса,category_name_Для дома и дачи,category_name_Животные,category_name_Личные вещи,category_name_Недвижимость,category_name_Работа,category_name_Транспорт,category_name_Услуги,...,param3_Без размера,param3_Вторичка,param3_Двигатель,param3_Кузов,param3_Подвеска,param3_Салон,param3_Система охлаждения,param3_Трансмиссия и привод,param3_Электрооборудование,gender
0,154189396,0,0,2,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
1,154189658,0,2,2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,154189676,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
3,154189693,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,154189709,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1911,154273019,10,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1912,154273038,0,0,0,0,3,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1913,154273048,0,0,0,0,0,0,1,0,2,...,0,0,0,0,0,0,0,0,0,0
1914,154273196,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
X = grouped_data.drop(columns=['gender', 'user_id']).values
Y = grouped_data['gender'].values
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_val, Y_train, Y_val = train_test_split(X_scaled, Y, test_size=0.1, random_state=42, stratify=Y.flatten())
Y_train = Y_train.reshape(-1,1)
Y_val = Y_val.reshape(-1,1)
model = LassoLogisticRegression(l1_coef=0.003, lr=0.001, tol=0.0000000001, max_iter=5e3).fit(X_train=X_train, y_train=Y_train)
print(accuracy(model.predict(X_val), Y_val.flatten()))

0.71875


In [None]:
from sklearn.metrics import roc_curve, auc
y_true = Y_val.flatten()
y_scores = model.predict_proba(X_val)
fpr, tpr, thresholds = roc_curve(y_true, y_scores)
roc_auc = auc(fpr, tpr)
optimal_idx = np.argmax(tpr - fpr)
optimal_threshold = thresholds[optimal_idx]
optimal_threshold

0.5702855355794799

In [None]:
test_data_processed = test_data_encoded.groupby(by = 'user_id').sum(numeric_only = bool).reset_index()

In [None]:
user_ids = test_data_processed['user_id']

X_test = test_data_processed.drop(columns=['user_id']).values
X_scaled_test = scaler.fit_transform(X_test)

predicted_gender = model.predict(X_scaled_test)
predicted_gender_series = pd.Series(predicted_gender)

predicted_gender_series = predicted_gender_series.map({1: 'male', 0: 'female'})

predictions_df = pd.DataFrame({
    'user_id': user_ids,
    'gender': predicted_gender_series
})

predictions_df.to_csv('test_predictions.csv', index=False)