In [1]:
import numpy as np
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from xgboost import XGBClassifier
import emoji
from sklearn.svm import SVC

In [2]:
train = pd.read_csv('train_data.csv')
test = pd.read_csv('test_data.csv')
train

Unnamed: 0,gender,age,fullname,username,biography,follower_count,following_count,is_business,is_verified,is_private
0,man,2,Farshid,mr_gh_farshid,دردا ک در این بادیه بسیار دویدیم...\nGlory man...,1604.0,1407.0,0.0,0.0,0.0
1,woman,2,zahr@72,zahra.roozbahani72,"خواهی که زکوچ در امان برگردی\nباید که به جان ,...",67.0,501.0,0.0,0.0,0.0
2,woman,2,ms farahnaz♥,___lady.farahnazi.__,"Having you, is all I wish for \nداشتنت، تمامِ...",0.0,0.0,0.0,0.0,0.0
3,woman,1,Lena.mommy farzan,mommy.lena3361,دردونه من لنا کوچولو,0.0,0.0,0.0,0.0,0.0
4,woman,2,Narsis Asadollahi,_l.aurora.l_,I am an animation student\n🎧🎼🎨⚓️🤍 \n@general.m...,200.0,328.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
7995,woman,2,Ŋεgɨŋ,negiiin_bahrmandi,﷽\nAllah IS Enough FoR Me?\n♥️?,0.0,0.0,0.0,0.0,0.0
7996,man,3,h🗯abdi🗯offìcial,h.abdi.official,حقوقی,0.0,0.0,1.0,0.0,0.0
7997,woman,2,⚜رویا احمدی⚜,roya.ahmadi.k,مهندس صنایع👩‍🔧🏭 Industrial engineer\nمعمار👩‍💻👷...,1260.0,1167.0,0.0,0.0,0.0
7998,man,3,لرستان &خرم اباد,erfanpouersif,khoramabad,0.0,0.0,0.0,0.0,0.0


### Preprocessing 

In [3]:
# do some preprocessing :)
train.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8000 entries, 0 to 7999
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   gender           8000 non-null   object 
 1   age              8000 non-null   int64  
 2   fullname         8000 non-null   object 
 3   username         8000 non-null   object 
 4   biography        8000 non-null   object 
 5   follower_count   8000 non-null   float64
 6   following_count  8000 non-null   float64
 7   is_business      7997 non-null   float64
 8   is_verified      8000 non-null   float64
 9   is_private       8000 non-null   float64
dtypes: float64(5), int64(1), object(4)
memory usage: 625.1+ KB


In [4]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   age              2000 non-null   int64  
 1   fullname         2000 non-null   object 
 2   username         2000 non-null   object 
 3   biography        2000 non-null   object 
 4   follower_count   2000 non-null   float64
 5   following_count  2000 non-null   float64
 6   is_business      2000 non-null   float64
 7   is_verified      2000 non-null   float64
 8   is_private       2000 non-null   float64
dtypes: float64(5), int64(1), object(3)
memory usage: 140.8+ KB


In [5]:
train.groupby(['gender'])['is_business'].value_counts()

gender  is_business
man     0.0            3475
        1.0             524
woman   0.0            3696
        1.0             302
Name: is_business, dtype: int64

In [6]:
train.groupby(['gender'])['is_private'].value_counts()

gender  is_private
man     0.0           4000
woman   0.0           3999
        1.0              1
Name: is_private, dtype: int64

In [7]:
train.groupby(['gender'])['follower_count'].mean()

gender
man      2963.39175
woman    2527.00875
Name: follower_count, dtype: float64

In [8]:
train.groupby(['gender'])['following_count'].mean()

gender
man      900.5085
woman    523.9045
Name: following_count, dtype: float64

In [9]:
train['is_business'].fillna(train['is_business'].mode()[0],inplace=True)

### Feature Extraction 

In [10]:
# Lenght of biography on Train data

train['len_bio'] = train['biography'].apply(len)
    
# Lenght of biography on Test data 

test['len_bio'] = test['biography'].apply(len)

In [11]:

# Define a function to count emojis in a string
def count_emojis(text):
    return emoji.emoji_count(text)

# Adding a new column to store the count of emojis in the 'biography' column
train['emoji_count'] = train['biography'].apply(count_emojis)
test['emoji_count']=test['biography'].apply(count_emojis)


### Feature Selection

In [12]:
X = train[['fullname', 'username','biography','len_bio','emoji_count','age','is_business','follower_count', 'following_count']]
y = train['gender']

### Implemention Of NLP 

##### I generate two model , First model Predict and Detect gender based on 'fullname', 'username','biography' ( Text model ) and i use the answer of the this model as a new feature for my Second model that is geneated by the others feature like 'len_bio','emoji_count','age', Etc.

In [14]:
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\benya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\benya\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [15]:
# Text preprocessing for NLP analysis

def preprocess_text(text):
    # Tokenization
    tokens = word_tokenize(text)
    # Remove stopwords and punctuation
    tokens = [word.lower() for word in tokens if word.isalnum() and word.lower() not in stopwords.words('english')]
    # Stemming
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word) for word in tokens]
    return ' '.join(tokens)


In [16]:
# Apply text preprocessing to each column containing text data
X['fullname'] = X['fullname'].apply(preprocess_text)
X['username'] = X['username'].apply(preprocess_text)
X['biography'] = X['biography'].apply(preprocess_text)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['fullname'] = X['fullname'].apply(preprocess_text)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['username'] = X['username'].apply(preprocess_text)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['biography'] = X['biography'].apply(preprocess_text)


### First model : Text model based 

In [17]:

X_text_feature = X[['fullname', 'username','biography']] 
x_train, x_val, y_train, y_val = train_test_split(X_text_feature, y, test_size=0.2, random_state=42)

In [18]:
txt_model = SVC()
tfidf = TfidfVectorizer()

In [19]:

tfidf.fit(train.fullname + ' ' + train.username + ' ' + train.biography)
txt_model.fit(tfidf.transform(x_train.fullname + ' ' + x_train.username + ' ' + x_train.biography), y_train)
X['gender_by_txt'] = txt_model.predict(tfidf.transform(train.fullname + ' ' + train.username + ' ' + train.biography))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['gender_by_txt'] = txt_model.predict(tfidf.transform(train.fullname + ' ' + train.username + ' ' + train.biography))


### Second model 

In [20]:
# X and Y for new model based on predicted y from the last text based model 

X_sec = X[['gender_by_txt','len_bio','emoji_count','age','is_business','follower_count', 'following_count']]
y_sec = train['gender']

In [21]:
# Standardize numerical features

numerical_col=['len_bio','emoji_count','age','follower_count', 'following_count']
scaler = StandardScaler()
X_sec[numerical_col]=scaler.fit_transform(X_sec[numerical_col])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_sec[numerical_col]=scaler.fit_transform(X_sec[numerical_col])


In [22]:
# Encoding Label 
le = LabelEncoder()
X_sec['gender_by_txt']=le.fit_transform(X_sec['gender_by_txt'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_sec['gender_by_txt']=le.fit_transform(X_sec['gender_by_txt'])


In [23]:
# Train / Test Split
X_train, X_val, Y_train, Y_val = train_test_split(X_sec, y_sec, test_size=0.2, random_state=42)

In [25]:
# Encode the labels 'man' and 'woman' to numerical values
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(Y_train)
y_val_encoded = label_encoder.transform(Y_val)

# Model training with different hyperparameters
xgb_classifier = XGBClassifier(
    learning_rate=0.4,
    max_depth=5,
    n_estimators=150,
    gamma=1,
    random_state=42,
    objective='binary:logistic',
    eval_metric='error',
    use_label_encoder=False
)
xgb_classifier.fit(X_train, y_train_encoded, eval_set=[(X_val, y_val_encoded)], verbose=False)

# Predictions
y_pred_encoded = xgb_classifier.predict(X_val)

# Inverse transform the predicted numerical values to 'man' and 'woman'
y_pred = label_encoder.inverse_transform(y_pred_encoded)

# Evaluation
print("Accuracy:", accuracy_score(y_val, y_pred))
#print(classification_report(y_val, y_pred))


Accuracy: 0.794375


#### The Accuracy of model on the final test data is : 80.3