### Load Libraries


In [1]:
import tensorflow as tf






In [3]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC as SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score,f1_score

In [4]:
import re
import pandas as pd
import numpy as np
from transformers import BertTokenizer, TFBertModel
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

### Data Preprocessing

In [5]:
# Load your dataset
df = pd.read_csv("author.csv", nrows=10000)

In [6]:
df=df[['Gender','Age Group','Content']]

In [7]:
df.head(10)

Unnamed: 0,Gender,Age Group,Content
0,female,30s,By deciding on a tidy organizational device yo...
1,female,20s,Each year we all make the same resolutions: to...
2,male,30s,For indoor utilizes these lights come in reduc...
3,female,30s,1. Look your best.<br />;If you're a Kurt Coba...
4,female,20s,For years tax debt resolution was about mislea...
5,female,30s,"<img class=""smiley"" src=""http://www.pan.net/sm..."
6,female,20s,"Its better a living Dog, than a dead Lion........"
7,female,30s,"<a href=""http://en.pan.netcom/go/out/url=-aHR0..."
8,female,20s,"For centuries, society has been slowly driftin..."
9,female,30s,Any homeowner that is currently dealing with a...


In [8]:
df = df[['Gender','Content']]  # Keep only relevant columns

In [9]:
df.head(10)

Unnamed: 0,Gender,Content
0,female,By deciding on a tidy organizational device yo...
1,female,Each year we all make the same resolutions: to...
2,male,For indoor utilizes these lights come in reduc...
3,female,1. Look your best.<br />;If you're a Kurt Coba...
4,female,For years tax debt resolution was about mislea...
5,female,"<img class=""smiley"" src=""http://www.pan.net/sm..."
6,female,"Its better a living Dog, than a dead Lion........"
7,female,"<a href=""http://en.pan.netcom/go/out/url=-aHR0..."
8,female,"For centuries, society has been slowly driftin..."
9,female,Any homeowner that is currently dealing with a...


### Removing numbers,html tags

In [10]:
def preprocess_text(text):
    text = re.sub(r'<\s*br\s*/?\s*>', '', text) # html tags
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\d', '', text)  # Remove numbers
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra whitespace
    return text

In [11]:
df = df[pd.notna(df['Content'])]

In [12]:
df['Processed Text'] = df['Content'].apply(lambda x: preprocess_text(x))

In [13]:
df.head(10)

Unnamed: 0,Gender,Content,Processed Text
0,female,By deciding on a tidy organizational device yo...,by deciding on a tidy organizational device yo...
1,female,Each year we all make the same resolutions: to...,each year we all make the same resolutions to ...
2,male,For indoor utilizes these lights come in reduc...,for indoor utilizes these lights come in reduc...
3,female,1. Look your best.<br />;If you're a Kurt Coba...,look your bestif youre a kurt cobain or johnny...
4,female,For years tax debt resolution was about mislea...,for years tax debt resolution was about mislea...
5,female,"<img class=""smiley"" src=""http://www.pan.net/sm...",img classsmiley srchttpwwwpannetsmiliessmilegi...
6,female,"Its better a living Dog, than a dead Lion........",its better a living dog than a dead lion
7,female,"<a href=""http://en.pan.netcom/go/out/url=-aHR0...",a hrefhttpenpannetcomgoouturlahrcdovlddytzwxyb...
8,female,"For centuries, society has been slowly driftin...",for centuries society has been slowly drifting...
9,female,Any homeowner that is currently dealing with a...,any homeowner that is currently dealing with a...


### BERT

#### Bert tokenizer


In [14]:
# BERT Tokenization
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
max_length = 32  # You can adjust this based on your requirements

In [15]:
def tokenize_text(text):
    tokens = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='tf',
        truncation=True
    )
    return tokens

df['Tokenized Text'] = df['Processed Text'].apply(lambda x: tokenize_text(x))

# BERT Embeddings
model = TFBertModel.from_pretrained('bert-base-uncased')

def get_bert_embeddings(tokens):
    outputs = model(tokens['input_ids'])
    return outputs.last_hidden_state





Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

#### BERT word embeddings

In [16]:
df['BERT Embeddings']= df['Tokenized Text'].apply(lambda x: get_bert_embeddings(x))

#### flattening bert embeddings

In [17]:
df['Flattened Embeddings'] = df['BERT Embeddings'].apply(lambda x: tf.reduce_mean(x, axis=1).numpy())

#### Concatenates the flattened embeddings with the original DataFrame.

In [18]:
# Stack the lists vertically to create a 2D array
embedding_array = np.vstack(df['Flattened Embeddings'])

# Create a new DataFrame with flattened BERT embeddings
embedding_df = pd.DataFrame(embedding_array, index=df.index)

# Concatenate the new DataFrame with the original DataFrame
df = pd.concat([df, embedding_df], axis=1)

In [19]:
df.head(10)

Unnamed: 0,Gender,Content,Processed Text,Tokenized Text,BERT Embeddings,Flattened Embeddings,0,1,2,3,...,758,759,760,761,762,763,764,765,766,767
0,female,By deciding on a tidy organizational device yo...,by deciding on a tidy organizational device yo...,"[input_ids, token_type_ids, attention_mask]","(((tf.Tensor(-0.31977496, shape=(), dtype=floa...","[[-0.40822378, 0.16020557, 0.10844434, 0.19822...",-0.408224,0.160206,0.108444,0.198225,...,-0.266288,-0.063559,0.106448,-0.118517,-0.158895,0.048908,-0.257749,-0.48819,-0.015447,0.066853
1,female,Each year we all make the same resolutions: to...,each year we all make the same resolutions to ...,"[input_ids, token_type_ids, attention_mask]","(((tf.Tensor(-0.2841652, shape=(), dtype=float...","[[0.19204016, 0.20596215, 0.43610308, -0.03095...",0.19204,0.205962,0.436103,-0.03096,...,0.1158,-0.147999,-0.045669,-0.2674,0.188941,0.121196,-0.140103,-0.574369,0.222512,-0.368437
2,male,For indoor utilizes these lights come in reduc...,for indoor utilizes these lights come in reduc...,"[input_ids, token_type_ids, attention_mask]","(((tf.Tensor(0.2705576, shape=(), dtype=float3...","[[0.14549167, 0.13775417, 0.32200363, 0.113609...",0.145492,0.137754,0.322004,0.11361,...,-0.033398,-0.5603,0.001773,-0.174417,0.023769,0.076998,0.1596,-0.334436,0.227025,-0.050596
3,female,1. Look your best.<br />;If you're a Kurt Coba...,look your bestif youre a kurt cobain or johnny...,"[input_ids, token_type_ids, attention_mask]","(((tf.Tensor(0.34701687, shape=(), dtype=float...","[[0.2837627, 0.041847527, -0.09660588, -0.1172...",0.283763,0.041848,-0.096606,-0.117241,...,-0.077965,-0.201927,0.166526,0.055085,-0.255779,0.110046,-0.239577,-0.451239,0.307221,-0.256727
4,female,For years tax debt resolution was about mislea...,for years tax debt resolution was about mislea...,"[input_ids, token_type_ids, attention_mask]","(((tf.Tensor(-0.31031987, shape=(), dtype=floa...","[[0.044066515, 0.22678693, 0.4011359, 0.164919...",0.044067,0.226787,0.401136,0.16492,...,-0.026658,-0.312066,0.017293,-0.301495,0.049524,-0.128189,-0.108095,-0.221666,0.073846,0.105643
5,female,"<img class=""smiley"" src=""http://www.pan.net/sm...",img classsmiley srchttpwwwpannetsmiliessmilegi...,"[input_ids, token_type_ids, attention_mask]","(((tf.Tensor(-0.13637497, shape=(), dtype=floa...","[[0.03602384, 0.028696936, 0.69203246, 0.04745...",0.036024,0.028697,0.692032,0.047457,...,0.063446,-0.353052,-0.16589,-0.320965,0.271925,0.394788,0.169672,-0.436163,-0.094736,0.044685
6,female,"Its better a living Dog, than a dead Lion........",its better a living dog than a dead lion,"[input_ids, token_type_ids, attention_mask]","(((tf.Tensor(-0.5687056, shape=(), dtype=float...","[[-0.007941224, -0.24636611, 0.50431967, 0.024...",-0.007941,-0.246366,0.50432,0.024584,...,-0.119097,0.096592,0.105595,0.150402,0.493145,-0.429353,-0.017817,-0.42142,0.248153,-0.65667
7,female,"<a href=""http://en.pan.netcom/go/out/url=-aHR0...",a hrefhttpenpannetcomgoouturlahrcdovlddytzwxyb...,"[input_ids, token_type_ids, attention_mask]","(((tf.Tensor(0.058876023, shape=(), dtype=floa...","[[0.14540425, 0.09266676, 0.23226413, -0.09741...",0.145404,0.092667,0.232264,-0.097417,...,0.007749,-0.185595,0.03377,-0.593248,-0.279917,-0.405872,-0.08762,-0.263684,-0.082102,-0.091261
8,female,"For centuries, society has been slowly driftin...",for centuries society has been slowly drifting...,"[input_ids, token_type_ids, attention_mask]","(((tf.Tensor(0.11305003, shape=(), dtype=float...","[[-0.11437962, -0.11993228, 0.3898145, -0.0363...",-0.11438,-0.119932,0.389814,-0.036342,...,-0.079416,-0.103932,-0.033183,-0.285232,0.18974,-0.659655,0.167518,-0.265183,0.065491,0.026417
9,female,Any homeowner that is currently dealing with a...,any homeowner that is currently dealing with a...,"[input_ids, token_type_ids, attention_mask]","(((tf.Tensor(0.05225724, shape=(), dtype=float...","[[-0.14339556, 0.4114551, 0.277157, -0.1753241...",-0.143396,0.411455,0.277157,-0.175324,...,-0.043816,-0.322101,0.12447,-0.153642,-0.220502,-0.021754,-0.021267,-0.511311,-0.147154,-0.029635


### String Indexer

In [20]:
from sklearn.preprocessing import LabelEncoder

In [21]:
column_to_index = 'Gender'
label_encoder = LabelEncoder()
df[column_to_index + '_indexed'] = label_encoder.fit_transform(df[column_to_index])

In [22]:
df.head(10)

Unnamed: 0,Gender,Content,Processed Text,Tokenized Text,BERT Embeddings,Flattened Embeddings,0,1,2,3,...,759,760,761,762,763,764,765,766,767,Gender_indexed
0,female,By deciding on a tidy organizational device yo...,by deciding on a tidy organizational device yo...,"[input_ids, token_type_ids, attention_mask]","(((tf.Tensor(-0.31977496, shape=(), dtype=floa...","[[-0.40822378, 0.16020557, 0.10844434, 0.19822...",-0.408224,0.160206,0.108444,0.198225,...,-0.063559,0.106448,-0.118517,-0.158895,0.048908,-0.257749,-0.48819,-0.015447,0.066853,0
1,female,Each year we all make the same resolutions: to...,each year we all make the same resolutions to ...,"[input_ids, token_type_ids, attention_mask]","(((tf.Tensor(-0.2841652, shape=(), dtype=float...","[[0.19204016, 0.20596215, 0.43610308, -0.03095...",0.19204,0.205962,0.436103,-0.03096,...,-0.147999,-0.045669,-0.2674,0.188941,0.121196,-0.140103,-0.574369,0.222512,-0.368437,0
2,male,For indoor utilizes these lights come in reduc...,for indoor utilizes these lights come in reduc...,"[input_ids, token_type_ids, attention_mask]","(((tf.Tensor(0.2705576, shape=(), dtype=float3...","[[0.14549167, 0.13775417, 0.32200363, 0.113609...",0.145492,0.137754,0.322004,0.11361,...,-0.5603,0.001773,-0.174417,0.023769,0.076998,0.1596,-0.334436,0.227025,-0.050596,1
3,female,1. Look your best.<br />;If you're a Kurt Coba...,look your bestif youre a kurt cobain or johnny...,"[input_ids, token_type_ids, attention_mask]","(((tf.Tensor(0.34701687, shape=(), dtype=float...","[[0.2837627, 0.041847527, -0.09660588, -0.1172...",0.283763,0.041848,-0.096606,-0.117241,...,-0.201927,0.166526,0.055085,-0.255779,0.110046,-0.239577,-0.451239,0.307221,-0.256727,0
4,female,For years tax debt resolution was about mislea...,for years tax debt resolution was about mislea...,"[input_ids, token_type_ids, attention_mask]","(((tf.Tensor(-0.31031987, shape=(), dtype=floa...","[[0.044066515, 0.22678693, 0.4011359, 0.164919...",0.044067,0.226787,0.401136,0.16492,...,-0.312066,0.017293,-0.301495,0.049524,-0.128189,-0.108095,-0.221666,0.073846,0.105643,0
5,female,"<img class=""smiley"" src=""http://www.pan.net/sm...",img classsmiley srchttpwwwpannetsmiliessmilegi...,"[input_ids, token_type_ids, attention_mask]","(((tf.Tensor(-0.13637497, shape=(), dtype=floa...","[[0.03602384, 0.028696936, 0.69203246, 0.04745...",0.036024,0.028697,0.692032,0.047457,...,-0.353052,-0.16589,-0.320965,0.271925,0.394788,0.169672,-0.436163,-0.094736,0.044685,0
6,female,"Its better a living Dog, than a dead Lion........",its better a living dog than a dead lion,"[input_ids, token_type_ids, attention_mask]","(((tf.Tensor(-0.5687056, shape=(), dtype=float...","[[-0.007941224, -0.24636611, 0.50431967, 0.024...",-0.007941,-0.246366,0.50432,0.024584,...,0.096592,0.105595,0.150402,0.493145,-0.429353,-0.017817,-0.42142,0.248153,-0.65667,0
7,female,"<a href=""http://en.pan.netcom/go/out/url=-aHR0...",a hrefhttpenpannetcomgoouturlahrcdovlddytzwxyb...,"[input_ids, token_type_ids, attention_mask]","(((tf.Tensor(0.058876023, shape=(), dtype=floa...","[[0.14540425, 0.09266676, 0.23226413, -0.09741...",0.145404,0.092667,0.232264,-0.097417,...,-0.185595,0.03377,-0.593248,-0.279917,-0.405872,-0.08762,-0.263684,-0.082102,-0.091261,0
8,female,"For centuries, society has been slowly driftin...",for centuries society has been slowly drifting...,"[input_ids, token_type_ids, attention_mask]","(((tf.Tensor(0.11305003, shape=(), dtype=float...","[[-0.11437962, -0.11993228, 0.3898145, -0.0363...",-0.11438,-0.119932,0.389814,-0.036342,...,-0.103932,-0.033183,-0.285232,0.18974,-0.659655,0.167518,-0.265183,0.065491,0.026417,0
9,female,Any homeowner that is currently dealing with a...,any homeowner that is currently dealing with a...,"[input_ids, token_type_ids, attention_mask]","(((tf.Tensor(0.05225724, shape=(), dtype=float...","[[-0.14339556, 0.4114551, 0.277157, -0.1753241...",-0.143396,0.411455,0.277157,-0.175324,...,-0.322101,0.12447,-0.153642,-0.220502,-0.021754,-0.021267,-0.511311,-0.147154,-0.029635,0


### Data Splitting

In [23]:
# Split the data into training and testing sets
X = np.vstack(df['Flattened Embeddings'].apply(lambda x: x.flatten()).to_numpy())
y = df['Gender_indexed']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

### Logistic Regression

In [24]:
logistic_regression_model = LogisticRegression()
logistic_regression_model.fit(X_train, y_train)
logistic_regression_predictions = logistic_regression_model.predict(X_test)

logistic_regression_accuracy = accuracy_score(y_test, logistic_regression_predictions)
precision_lr = precision_score(y_test, logistic_regression_predictions)
recall_lr = recall_score(y_test,logistic_regression_predictions)
f1_lr= f1_score(y_test,logistic_regression_predictions)
print("Logistic Regression Accuracy :", "{:.2f}%".format(100*logistic_regression_accuracy))


Logistic Regression Accuracy : 53.43%


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### KNN

In [25]:
kneighbors_model = KNeighborsClassifier()
kneighbors_model.fit(X_train, y_train)
kneighbors_predictions = kneighbors_model.predict(X_test)

kneighbors_accuracy = accuracy_score(y_test, kneighbors_predictions)
precision_knn = precision_score(y_test, kneighbors_predictions)
recall_knn = recall_score(y_test,kneighbors_predictions)
f1_knn= f1_score(y_test, kneighbors_predictions)
print("KNN Accuracy :", "{:.2f}%".format(100*kneighbors_accuracy))


KNN Accuracy : 51.83%


### Naive Bayes

In [26]:
naive_bayes_model = GaussianNB()
naive_bayes_model.fit(X_train, y_train)
naive_bayes_predictions = naive_bayes_model.predict(X_test)

naive_bayes_accuracy = accuracy_score(y_test, naive_bayes_predictions)
precision_nb = precision_score(y_test, naive_bayes_predictions)
recall_nb= recall_score(y_test,naive_bayes_predictions)
f1_nb= f1_score(y_test, naive_bayes_predictions)
print("naive bayes accuracy :", "{:.2f}%".format(100*naive_bayes_accuracy))

naive bayes accuracy : 52.13%


### SVM

In [27]:
svc_model = SVC()
svc_model.fit(X_train, y_train)
svc_predictions = svc_model.predict(X_test)

svc_accuracy = accuracy_score(y_test, svc_predictions)
precision_svm = precision_score(y_test, svc_predictions)
recall_svm = recall_score(y_test,svc_predictions)
f1_svm= f1_score(y_test, svc_predictions)
print("svm accuracy :", "{:.2f}%".format(100*svc_accuracy))

svm accuracy : 53.63%




### Random Forest

In [28]:
random_forest_model = RandomForestClassifier()
random_forest_model.fit(X_train, y_train)
random_forest_predictions = random_forest_model.predict(X_test)

random_forest_accuracy = accuracy_score(y_test, random_forest_predictions)
precision_rf = precision_score(y_test, random_forest_predictions)
recall_rf = recall_score(y_test,random_forest_predictions)
f1_rf= f1_score(y_test, random_forest_predictions)
print("Random Forest accuracy:", "{:.2f}%".format(100*random_forest_accuracy))

Random Forest accuracy: 53.50%


### Decision Tree

In [29]:
decision_tree_model = DecisionTreeClassifier()
decision_tree_model.fit(X_train, y_train)
decision_tree_predictions = decision_tree_model.predict(X_test)

decision_tree_accuracy = accuracy_score(y_test, decision_tree_predictions)
precision_dt = precision_score(y_test, decision_tree_predictions)
recall_dt = recall_score(y_test, decision_tree_predictions)
f1_dt= f1_score(y_test, decision_tree_predictions)
print("decision Tree accuracy:", "{:.2f}%".format(100*decision_tree_accuracy))

decision Tree accuracy: 53.10%


### Adaboost

In [30]:
from sklearn.ensemble import AdaBoostClassifier

adaboost_model = AdaBoostClassifier()
adaboost_model.fit(X_train, y_train)
adaboost_predictions = adaboost_model.predict(X_test)

adaboost_accuracy = accuracy_score(y_test, adaboost_predictions)
precision_ada = precision_score(y_test, adaboost_predictions)
recall_ada = recall_score(y_test, adaboost_predictions)
f1_ada= f1_score(y_test, adaboost_predictions)
print("Adaboost Accuracy :", "{:.2f}%".format(100*adaboost_accuracy))

Adaboost Accuracy : 51.60%


In [31]:
comparision_table = pd.DataFrame({
    'Model': ['Logistic Regression', 'K-neighbours','Naive Bayes','SVM','Random Forest','Decision Tree','Ada Boost' ],
    'Accuracy': [logistic_regression_accuracy,kneighbors_accuracy,naive_bayes_accuracy,svc_accuracy,random_forest_accuracy,decision_tree_accuracy,adaboost_accuracy],
    'Precision':[precision_lr,precision_knn,precision_nb ,precision_svm,precision_rf,precision_dt,precision_ada],
        'Recall':[recall_lr,recall_knn,recall_nb ,recall_svm,recall_rf,recall_dt,recall_ada],
            'F1':[f1_lr,f1_knn,f1_nb ,f1_svm,f1_rf,f1_dt,f1_ada]
})

comparision_table

Unnamed: 0,Model,Accuracy,Precision,Recall,F1
0,Logistic Regression,0.534333,0.540414,0.532938,0.53665
1,K-neighbours,0.518333,0.524881,0.507246,0.515913
2,Naive Bayes,0.521333,0.562691,0.242424,0.338858
3,SVM,0.536333,0.542589,0.532938,0.53772
4,Random Forest,0.535,0.546067,0.480237,0.511041
5,Decision Tree,0.531,0.538622,0.509881,0.523858
6,Ada Boost,0.516,0.521797,0.520422,0.521108
