
### **Importing Libraries**

In [3]:
import os
import numpy as np
import pandas as pd
import re
import tensorflow as tf
from sklearn.impute import SimpleImputer
from bs4 import BeautifulSoup
import re,string,unicodedata
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report,confusion_matrix


## Loding Dataset

In [4]:
data=pd.read_csv('author.csv',nrows=10000)

In [5]:
data.head(10)

Unnamed: 0,Gender,Age Group,Content
0,female,30s,By deciding on a tidy organizational device yo...
1,female,20s,Each year we all make the same resolutions: to...
2,male,30s,For indoor utilizes these lights come in reduc...
3,female,30s,1. Look your best.<br />;If you're a Kurt Coba...
4,female,20s,For years tax debt resolution was about mislea...
5,female,30s,"<img class=""smiley"" src=""http://www.pan.net/sm..."
6,female,20s,"Its better a living Dog, than a dead Lion........"
7,female,30s,"<a href=""http://en.pan.netcom/go/out/url=-aHR0..."
8,female,20s,"For centuries, society has been slowly driftin..."
9,female,30s,Any homeowner that is currently dealing with a...


### Drop rows with missing Values in content

In [6]:
missing_values = data['Content'].isnull().sum()
missing_values

0

In [7]:
data = data.dropna(subset=['Content'])

### Null values in the dataset

In [8]:
null_values = data.isnull().sum()

In [9]:
print('There are {} missing values for {} and {} missing values for {} {} {}.'.format(null_values[0],null_values.index[0],null_values[1],null_values.index[1],null_values[2],null_values.index[2]))

There are 0 missing values for Gender and 0 missing values for Age Group 0 Content.


### Duplicate Rows

In [10]:
num_duplicates = data.duplicated().sum()
print('There are {} duplicatetext present in the dataset'.format(num_duplicates))

There are 16 duplicatetext present in the dataset


In [11]:
data.drop_duplicates(inplace = True)

### Remaining Rows

In [12]:
print('The dataset contains {} rows and {} columns after removing duplicates'.format(data.shape[0],data.shape[1]))

The dataset contains 9984 rows and 3 columns after removing duplicates


In [13]:
data.head(10)

Unnamed: 0,Gender,Age Group,Content
0,female,30s,By deciding on a tidy organizational device yo...
1,female,20s,Each year we all make the same resolutions: to...
2,male,30s,For indoor utilizes these lights come in reduc...
3,female,30s,1. Look your best.<br />;If you're a Kurt Coba...
4,female,20s,For years tax debt resolution was about mislea...
5,female,30s,"<img class=""smiley"" src=""http://www.pan.net/sm..."
6,female,20s,"Its better a living Dog, than a dead Lion........"
7,female,30s,"<a href=""http://en.pan.netcom/go/out/url=-aHR0..."
8,female,20s,"For centuries, society has been slowly driftin..."
9,female,30s,Any homeowner that is currently dealing with a...


## Count of Ages and Gender

In [14]:
age_counts = data['Age Group'].value_counts()
gender_counts = data['Gender'].value_counts()
print("Age Counts:")
print(age_counts)
print("\nGender Counts:")
print(gender_counts)

Age Counts:
30s    5708
20s    3556
10s     720
Name: Age Group, dtype: int64

Gender Counts:
male      5049
female    4935
Name: Gender, dtype: int64


## PreProcessing

In [15]:
def preprocess_text(text):
    text = re.sub(r'<\s*br\s*/?\s*>', '', text) # html tags
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\d', '', text)  # Remove numbers
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra whitespace
    return text

In [16]:
print(data['Content'].apply(type).value_counts())

<class 'str'>    9984
Name: Content, dtype: int64


In [17]:
data=data[pd.notna(data['Content'])]

In [18]:
data['Processed Text'] = data['Content'].apply(preprocess_text)

In [19]:
data.head(10)

Unnamed: 0,Gender,Age Group,Content,Processed Text
0,female,30s,By deciding on a tidy organizational device yo...,by deciding on a tidy organizational device yo...
1,female,20s,Each year we all make the same resolutions: to...,each year we all make the same resolutions to ...
2,male,30s,For indoor utilizes these lights come in reduc...,for indoor utilizes these lights come in reduc...
3,female,30s,1. Look your best.<br />;If you're a Kurt Coba...,look your bestif youre a kurt cobain or johnny...
4,female,20s,For years tax debt resolution was about mislea...,for years tax debt resolution was about mislea...
5,female,30s,"<img class=""smiley"" src=""http://www.pan.net/sm...",img classsmiley srchttpwwwpannetsmiliessmilegi...
6,female,20s,"Its better a living Dog, than a dead Lion........",its better a living dog than a dead lion
7,female,30s,"<a href=""http://en.pan.netcom/go/out/url=-aHR0...",a hrefhttpenpannetcomgoouturlahrcdovlddytzwxyb...
8,female,20s,"For centuries, society has been slowly driftin...",for centuries society has been slowly drifting...
9,female,30s,Any homeowner that is currently dealing with a...,any homeowner that is currently dealing with a...


## String Indexer

In [20]:
column_to_index = 'Gender'
label_encoder = LabelEncoder()
data[column_to_index + '_indexed'] = label_encoder.fit_transform(data[column_to_index])
data.head(10)

Unnamed: 0,Gender,Age Group,Content,Processed Text,Gender_indexed
0,female,30s,By deciding on a tidy organizational device yo...,by deciding on a tidy organizational device yo...,0
1,female,20s,Each year we all make the same resolutions: to...,each year we all make the same resolutions to ...,0
2,male,30s,For indoor utilizes these lights come in reduc...,for indoor utilizes these lights come in reduc...,1
3,female,30s,1. Look your best.<br />;If you're a Kurt Coba...,look your bestif youre a kurt cobain or johnny...,0
4,female,20s,For years tax debt resolution was about mislea...,for years tax debt resolution was about mislea...,0
5,female,30s,"<img class=""smiley"" src=""http://www.pan.net/sm...",img classsmiley srchttpwwwpannetsmiliessmilegi...,0
6,female,20s,"Its better a living Dog, than a dead Lion........",its better a living dog than a dead lion,0
7,female,30s,"<a href=""http://en.pan.netcom/go/out/url=-aHR0...",a hrefhttpenpannetcomgoouturlahrcdovlddytzwxyb...,0
8,female,20s,"For centuries, society has been slowly driftin...",for centuries society has been slowly drifting...,0
9,female,30s,Any homeowner that is currently dealing with a...,any homeowner that is currently dealing with a...,0


## Data Splitting

In [21]:
X = data['Processed Text']
y = data['Gender_indexed']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

## Feature Extraction
TF-IDF, Count Vectorizer

In [22]:
tfidf_vect = TfidfVectorizer()
Xtrain_tfidf = tfidf_vect.fit_transform(X_train)
Xtest_tfidf = tfidf_vect.transform(X_test)

count_vect = CountVectorizer()
Xtrain_count = count_vect.fit_transform(X_train)
Xtest_count = count_vect.transform(X_test)

## Models

In [23]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


### Logistic Rgression

In [24]:
logistic_regression_count = LogisticRegression(max_iter=100, random_state=42)
logistic_regression_count.fit(Xtrain_count, y_train)

predictions_lr = logistic_regression_count.predict(Xtest_count)
accuracy_lr= accuracy_score(y_test, predictions_lr)
precision_lr = precision_score(y_test, predictions_lr)
recall_lr = recall_score(y_test, predictions_lr)
f1_lr = f1_score(y_test, predictions_lr)
print("Logistic Regression Classifier  Accuracy :", "{:.2f}%".format(100*accuracy_lr))

Logistic Regression Classifier  Accuracy : 54.07%


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### KNN

In [25]:
knn_tfidf = KNeighborsClassifier()
knn_tfidf.fit(Xtrain_tfidf, y_train)

predictions_knn= knn_tfidf.predict(Xtest_tfidf)
accuracy_knn= accuracy_score(y_test, predictions_knn)
precision_knn = precision_score(y_test, predictions_knn)
recall_knn = recall_score(y_test, predictions_knn)
f1_knn = f1_score(y_test, predictions_knn)
print("K-Nearest Neighbors Classifier  Accuracy :", "{:.2f}%".format(100*accuracy_knn))

K-Nearest Neighbors Classifier  Accuracy : 53.40%


### Naive Bayes

In [26]:
naive_bayes_classifier = MultinomialNB()
naive_bayes_classifier.fit(Xtrain_count, y_train)

predictions_nb = naive_bayes_classifier.predict(Xtest_count)
accuracy_nb = accuracy_score(y_test, predictions_nb)
precision_nb = precision_score(y_test, predictions_nb)
recall_nb = recall_score(y_test, predictions_nb)
f1_nb = f1_score(y_test, predictions_nb)
print("Naive Bayes Accuracy :", "{:.2f}%".format(100*accuracy_nb))

Naive Bayes Accuracy : 55.04%


### SVM

In [27]:
linear_svc = LinearSVC(penalty='l2',loss = 'hinge')
linear_svc.fit(Xtrain_tfidf,y_train)

predictions_svm=linear_svc.predict(Xtest_tfidf)
accuracy_svm=accuracy_score(y_test,predictions_svm)
precision_svm = precision_score(y_test, predictions_svm)
recall_svm = recall_score(y_test, predictions_svm)
f1_svm= f1_score(y_test, predictions_svm)
print("Linear Support Vector Classifier Accuracy :", "{:.2f}%".format(100*accuracy_svm))

Linear Support Vector Classifier Accuracy : 54.71%


### Random Forest

In [28]:
random_forest_tfidf = RandomForestClassifier(n_estimators=100, random_state=42)
random_forest_tfidf.fit(Xtrain_tfidf, y_train)

predictions_rf= random_forest_tfidf.predict(Xtest_tfidf)
accuracy_rf= accuracy_score(y_test, predictions_rf)
precision_rf = precision_score(y_test, predictions_rf)
recall_rf= recall_score(y_test, predictions_rf)
f1_rf= f1_score(y_test, predictions_rf)
print("Random Forest Classifier  Accuracy :", "{:.2f}%".format(100*accuracy_rf))

Random Forest Classifier  Accuracy : 55.11%


### Decision Tree Classifier

In [29]:
decision_tree_count = DecisionTreeClassifier()
decision_tree_count.fit(Xtrain_count, y_train)

predictions_dt = decision_tree_count.predict(Xtest_count)
accuracy_dt= accuracy_score(y_test, predictions_dt)
precision_dt = precision_score(y_test, predictions_dt)
recall_dt= recall_score(y_test, predictions_dt)
f1_dt= f1_score(y_test, predictions_dt)
print("KDecision Tree Classifier Accuracy :", "{:.2f}%".format(100*accuracy_dt))

KDecision Tree Classifier Accuracy : 52.84%


### Adaboost

In [30]:
base_classifier = DecisionTreeClassifier(max_depth=1)
adaboost_classifier = AdaBoostClassifier(base_classifier, n_estimators=50, random_state=42)
adaboost_classifier.fit(Xtrain_tfidf, y_train)

predictions_ada = adaboost_classifier.predict(Xtest_tfidf)
accuracy_ada = accuracy_score(y_test, predictions_ada)
precision_ada = precision_score(y_test, predictions_ada)
recall_ada = recall_score(y_test, predictions_ada)
f1_ada= f1_score(y_test, predictions_ada)
print("Adaboost Accuracy :", "{:.2f}%".format(100*accuracy_ada))

Adaboost Accuracy : 52.74%


## Comparisions

In [31]:
comparision_table = pd.DataFrame({
    'Model': ['Logistic Regression', 'K-neighbours','Naive Bayes','SVM','Random Forest','Decision Tree','Ada Boost' ],
    'Accuracy': [accuracy_lr,accuracy_knn,accuracy_nb ,accuracy_svm, accuracy_rf,accuracy_dt,accuracy_ada],
    'Precision':[precision_lr,precision_knn,precision_nb ,precision_svm,precision_rf,precision_dt,precision_ada],
        'Recall':[recall_lr,recall_knn,recall_nb ,recall_svm,recall_rf,recall_dt,recall_ada],
            'F1':[f1_lr,f1_knn,f1_nb ,f1_svm,f1_rf,f1_dt,f1_ada]
})

comparision_table

Unnamed: 0,Model,Accuracy,Precision,Recall,F1
0,Logistic Regression,0.540721,0.548387,0.567037,0.557556
1,K-neighbours,0.534045,0.53672,0.63571,0.582036
2,Naive Bayes,0.550401,0.578856,0.436887,0.49795
3,SVM,0.547063,0.563609,0.498365,0.528983
4,Random Forest,0.551068,0.573955,0.466972,0.514966
5,Decision Tree,0.528371,0.537323,0.546109,0.54168
6,Ada Boost,0.52737,0.532194,0.610857,0.568819
