# Import required libraries

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV

# Introduction to the dataset

<p dir=rtl style="direction: rtl; text-align: justify; line-height:200%; font-family:vazir; font-size:medium">
<font face="vazir" size=3>
The training dataset has 8000 rows and 10 columns. In the table below, more information is written about the data    
</font>
</p>

<center>
<div dir=rtl style="direction: rtl;line-height:200%;font-family:vazir;font-size:medium">
<font face="vazir" size=3>
    
|ستون|توضیحات|
|:------:|:---:|
|gender|جنسیت (ستون هدف)|
|age|بازه سنی کاربر|
|fullname|نامی که در پروفایل شبکه اجتماعی نوشته شده است |
|username|نام کاربری|
|biography|بیوگرافی شبکه اجتماعی کاربر|
|follower_count|تعداد دنبال‌کننده‌های کاربر|
|following_count|تعداد افرادی که کاربر را دنبال می‌کنند|
|is_business|کسب‌وکاری بودن حساب کاربری|
|is_verified|تایید شده بودن حساب کاربری|
|is_private|خصوصی بودن حساب کاربری|
    
</font>
</div>
</center>


<p dir=rtl style="direction: rtl; text-align: justify; line-height:200%; font-family:vazir; font-size:medium">
<font face="vazir" size=3>
    The age column is not a continuous variable; Rather, it shows the age groups. In the table below, we see how age is mapped
</font>
</p>


<center>
<div dir=rtl style="direction: rtl;line-height:200%;font-family:vazir;font-size:medium">
<font face="vazir" size=3>
    
|سن واقعی کاربران|عدد نگاشت شده|
|:------:|:---:|
|1|کمتر از ۱۸ سال|
|2|از ۱۹ تا ۲۹ سال|
|3|از ۳۰ تا ۴۰ سال|
|4|بالاتر از ۴۰ سال|
    
</font>
</div>
</center>


# Reading the dataset

In [3]:
train = pd.read_csv('../data/train_data.csv')

train

Unnamed: 0,gender,age,fullname,username,biography,follower_count,following_count,is_business,is_verified,is_private
0,man,2,Farshid,mr_gh_farshid,دردا ک در این بادیه بسیار دویدیم...\nGlory man...,1604.0,1407.0,0.0,0.0,0.0
1,woman,2,zahr@72,zahra.roozbahani72,"خواهی که زکوچ در امان برگردی\nباید که به جان ,...",67.0,501.0,0.0,0.0,0.0
2,woman,2,ms farahnaz♥,___lady.farahnazi.__,"Having you, is all I wish for \nداشتنت، تمامِ...",0.0,0.0,0.0,0.0,0.0
3,woman,1,Lena.mommy farzan,mommy.lena3361,دردونه من لنا کوچولو,0.0,0.0,0.0,0.0,0.0
4,woman,2,Narsis Asadollahi,_l.aurora.l_,I am an animation student\n🎧🎼🎨⚓️🤍 \n@general.m...,200.0,328.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
7995,woman,2,Ŋεgɨŋ,negiiin_bahrmandi,﷽\nAllah IS Enough FoR Me?\n♥️?,0.0,0.0,0.0,0.0,0.0
7996,man,3,h🗯abdi🗯offìcial,h.abdi.official,حقوقی,0.0,0.0,1.0,0.0,0.0
7997,woman,2,⚜رویا احمدی⚜,roya.ahmadi.k,مهندس صنایع👩‍🔧🏭 Industrial engineer\nمعمار👩‍💻👷...,1260.0,1167.0,0.0,0.0,0.0
7998,man,3,لرستان &خرم اباد,erfanpouersif,khoramabad,0.0,0.0,0.0,0.0,0.0


# Preprocessing

In [5]:
x_train = train.drop('gender', axis=1)
y_train = train['gender']
y_train = y_train.map({'man': 0, 'woman': 1})

In [None]:
tfidf = TfidfVectorizer()
tfidf.fit(x_train['fullname'] + ' ' + x_train['username'] + ' ' + x_train['biography'])

encoded_text_train = tfidf.transform(x_train['fullname'] + ' ' + x_train['username'] + ' ' + x_train['biography'])

g_s = GridSearchCV(RandomForestClassifier(), 
                {'n_estimators': [50, 75, 100], 'max_depth': [5, 10, 15, 20],
                    'min_samples_split':[1, 2, 3, 4]},
                    scoring='f1')
g_s.fit(encoded_text_train, y_train)
g_s.best_params_

{'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 100}

In [9]:
model = RandomForestClassifier(n_estimators=100, max_depth=20, min_samples_split=3)
model.fit(encoded_text_train, y_train)

gender_by_text_train = model.predict(encoded_text_train)

x_train['gender_by_text'] = gender_by_text_train

print(f1_score(y_train, gender_by_text_train))

0.8744690891930156


In [10]:
x_train.drop(columns=['fullname', 'username', 'biography'], axis=1, inplace=True)

In [11]:
x_train

Unnamed: 0,age,follower_count,following_count,is_business,is_verified,is_private,gender_by_text
0,2,1604.0,1407.0,0.0,0.0,0.0,0
1,2,67.0,501.0,0.0,0.0,0.0,1
2,2,0.0,0.0,0.0,0.0,0.0,1
3,1,0.0,0.0,0.0,0.0,0.0,1
4,2,200.0,328.0,0.0,0.0,0.0,1
...,...,...,...,...,...,...,...
7995,2,0.0,0.0,0.0,0.0,0.0,1
7996,3,0.0,0.0,1.0,0.0,0.0,0
7997,2,1260.0,1167.0,0.0,0.0,0.0,1
7998,3,0.0,0.0,0.0,0.0,0.0,1


In [12]:
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.2, random_state=13)

x_train.reset_index(drop=True, inplace=True)
x_val.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)
y_val.reset_index(drop=True, inplace=True)

In [13]:
x_train.isna().sum()

age                0
follower_count     0
following_count    0
is_business        3
is_verified        0
is_private         0
gender_by_text     0
dtype: int64

In [14]:
x_train['is_business'].fillna(x_train['is_business'].mode()[0], inplace=True)  

In [15]:
x_train.isna().sum()

age                0
follower_count     0
following_count    0
is_business        0
is_verified        0
is_private         0
gender_by_text     0
dtype: int64

# Modeling

In [17]:
g_s = GridSearchCV(RandomForestClassifier(), 
                {'n_estimators': [50, 75, 100], 'max_depth': [5, 10, 15, 20],
                    'min_samples_split':[1, 2, 3, 4]},
                    scoring='f1')

g_s.fit(x_train, y_train)

g_s.best_params_

{'max_depth': 5, 'min_samples_split': 2, 'n_estimators': 100}

In [18]:
model = RandomForestClassifier(n_estimators=50, max_depth=5, min_samples_split=1)

model.fit(x_train, y_train)

y_train_pred = model.predict(x_train)
y_val_pred = model.predict(x_val)

# Evaluate model

In [21]:
print(classification_report(y_train, y_train_pred))
print(classification_report(y_val, y_val_pred))

print(f1_score(y_train, y_train_pred))
print(f1_score(y_val, y_val_pred))

              precision    recall  f1-score   support

           0       0.92      0.83      0.87      3204
           1       0.84      0.92      0.88      3196

    accuracy                           0.87      6400
   macro avg       0.88      0.87      0.87      6400
weighted avg       0.88      0.87      0.87      6400

              precision    recall  f1-score   support

           0       0.90      0.80      0.85       796
           1       0.83      0.92      0.87       804

    accuracy                           0.86      1600
   macro avg       0.86      0.86      0.86      1600
weighted avg       0.86      0.86      0.86      1600

0.8805012680889154
0.8679245283018868
