
### **Importing Libraries**

In [2]:
import os
import numpy as np
import pandas as pd
import re
import tensorflow as tf
from sklearn.impute import SimpleImputer
from bs4 import BeautifulSoup
import re,string,unicodedata
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report,confusion_matrix






## Loding Dataset

In [3]:
data=pd.read_csv('author.csv',nrows=10000)

In [4]:
data.head(10)

Unnamed: 0,Gender,Age Group,Content
0,female,30s,By deciding on a tidy organizational device yo...
1,female,20s,Each year we all make the same resolutions: to...
2,male,30s,For indoor utilizes these lights come in reduc...
3,female,30s,1. Look your best.<br />;If you're a Kurt Coba...
4,female,20s,For years tax debt resolution was about mislea...
5,female,30s,"<img class=""smiley"" src=""http://www.pan.net/sm..."
6,female,20s,"Its better a living Dog, than a dead Lion........"
7,female,30s,"<a href=""http://en.pan.netcom/go/out/url=-aHR0..."
8,female,20s,"For centuries, society has been slowly driftin..."
9,female,30s,Any homeowner that is currently dealing with a...


### Drop rows with missing Values in content

In [5]:
missing_values = data['Content'].isnull().sum()
missing_values

0

In [6]:
data = data.dropna(subset=['Content'])

### Null values in the dataset

In [7]:
null_values = data.isnull().sum()

In [8]:
print('There are {} missing values for {} and {} missing values for {} {} {}.'.format(null_values[0],null_values.index[0],null_values[1],null_values.index[1],null_values[2],null_values.index[2]))

There are 0 missing values for Gender and 0 missing values for Age Group 0 Content.


### Duplicate Rows

In [9]:
num_duplicates = data.duplicated().sum()
print('There are {} duplicatetext present in the dataset'.format(num_duplicates))

There are 16 duplicatetext present in the dataset


In [10]:
data.drop_duplicates(inplace = True)

### Remaining Rows

In [11]:
print('The dataset contains {} rows and {} columns after removing duplicates'.format(data.shape[0],data.shape[1]))

The dataset contains 9984 rows and 3 columns after removing duplicates


In [12]:
data.head(10)

Unnamed: 0,Gender,Age Group,Content
0,female,30s,By deciding on a tidy organizational device yo...
1,female,20s,Each year we all make the same resolutions: to...
2,male,30s,For indoor utilizes these lights come in reduc...
3,female,30s,1. Look your best.<br />;If you're a Kurt Coba...
4,female,20s,For years tax debt resolution was about mislea...
5,female,30s,"<img class=""smiley"" src=""http://www.pan.net/sm..."
6,female,20s,"Its better a living Dog, than a dead Lion........"
7,female,30s,"<a href=""http://en.pan.netcom/go/out/url=-aHR0..."
8,female,20s,"For centuries, society has been slowly driftin..."
9,female,30s,Any homeowner that is currently dealing with a...


## Count of Ages and Gender

In [13]:
age_counts = data['Age Group'].value_counts()
gender_counts = data['Gender'].value_counts()
print("Age Counts:")
print(age_counts)
print("\nGender Counts:")
print(gender_counts)

Age Counts:
30s    5708
20s    3556
10s     720
Name: Age Group, dtype: int64

Gender Counts:
male      5049
female    4935
Name: Gender, dtype: int64


## PreProcessing

In [14]:
def preprocess_text(text):
    text = re.sub(r'<\s*br\s*/?\s*>', ' ', text) # html tags
    text = re.sub(r'<.*?>',' ',text)
    text = re.sub(r'\d', ' ', text)  # Remove numbers
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra whitespace
    return text

In [15]:
print(data['Content'].apply(type).value_counts())

<class 'str'>    9984
Name: Content, dtype: int64


In [16]:
data=data[pd.notna(data['Content'])]

In [17]:
data['Processed Text'] = data['Content'].apply(preprocess_text)

In [18]:
data.head(10)

Unnamed: 0,Gender,Age Group,Content,Processed Text
0,female,30s,By deciding on a tidy organizational device yo...,By deciding on a tidy organizational device yo...
1,female,20s,Each year we all make the same resolutions: to...,Each year we all make the same resolutions: to...
2,male,30s,For indoor utilizes these lights come in reduc...,For indoor utilizes these lights come in reduc...
3,female,30s,1. Look your best.<br />;If you're a Kurt Coba...,. Look your best. ;If you're a Kurt Cobain or ...
4,female,20s,For years tax debt resolution was about mislea...,For years tax debt resolution was about mislea...
5,female,30s,"<img class=""smiley"" src=""http://www.pan.net/sm...",; today is a wonderful day. I just spoke to a ...
6,female,20s,"Its better a living Dog, than a dead Lion........","Its better a living Dog, than a dead Lion........"
7,female,30s,"<a href=""http://en.pan.netcom/go/out/url=-aHR0...",retirement villages tauranga I have a dilemma....
8,female,20s,"For centuries, society has been slowly driftin...","For centuries, society has been slowly driftin..."
9,female,30s,Any homeowner that is currently dealing with a...,Any homeowner that is currently dealing with a...


In [20]:
# Count commas and add a new column
data['comma'] = data['Processed Text'].apply(lambda x: x.count(','))

# Count hashtags and add a new colu

data['hashtag'] = data['Processed Text'].apply(lambda x: x.count('#'))

# Count question marks and add a new column
data['question'] = data['Processed Text'].apply(lambda x: x.count('?'))

# Count ; and add a new column
data['colon'] = data['Processed Text'].apply(lambda x: x.count(';'))

# Count : and add a new column
data['semicolon'] = data['Processed Text'].apply(lambda x: x.count(':'))

# Count . and add a new column
data['fullstop'] = data['Processed Text'].apply(lambda x: x.count('.'))

#count capital letters
data['uppercase'] = data['Processed Text'].apply(lambda x: sum(1 for c in x if c.isupper()))

In [21]:
# Display the updated DataFrame
data

Unnamed: 0,Gender,Age Group,Content,Processed Text,comma,hashtag,question,colon,semicolon,fullstop,uppercase
0,female,30s,By deciding on a tidy organizational device yo...,By deciding on a tidy organizational device yo...,12,0,0,0,0,28,29
1,female,20s,Each year we all make the same resolutions: to...,Each year we all make the same resolutions: to...,15,0,5,19,2,21,64
2,male,30s,For indoor utilizes these lights come in reduc...,For indoor utilizes these lights come in reduc...,23,0,0,24,1,30,41
3,female,30s,1. Look your best.<br />;If you're a Kurt Coba...,. Look your best. ;If you're a Kurt Cobain or ...,27,0,0,19,2,58,58
4,female,20s,For years tax debt resolution was about mislea...,For years tax debt resolution was about mislea...,25,0,1,1,2,35,58
...,...,...,...,...,...,...,...,...,...,...,...
9995,male,20s,unfortunate<br />;For your past three total mo...,"unfortunate ;For your past three total months,...",43,1,0,24,1,0,78
9996,male,30s,Materials like Artex asbestos necessitate you ...,Materials like Artex asbestos necessitate you ...,27,0,0,23,1,27,41
9997,female,20s,This is actually the tip of the curing approac...,This is actually the tip of the curing approac...,20,0,1,102,4,24,83
9998,female,30s,"On Feb 7 2011,??.I won a 32 inch Samsung TV.It...","On Feb ,??.I won a inch Samsung TV.It's that a...",1,0,2,1,0,3,8


## String Indexer

In [22]:
column_to_index = 'Age Group'
label_encoder = LabelEncoder()
data[column_to_index + '_indexed'] = label_encoder.fit_transform(data[column_to_index])
data.head(5)

Unnamed: 0,Gender,Age Group,Content,Processed Text,comma,hashtag,question,colon,semicolon,fullstop,uppercase,Age Group_indexed
0,female,30s,By deciding on a tidy organizational device yo...,By deciding on a tidy organizational device yo...,12,0,0,0,0,28,29,2
1,female,20s,Each year we all make the same resolutions: to...,Each year we all make the same resolutions: to...,15,0,5,19,2,21,64,1
2,male,30s,For indoor utilizes these lights come in reduc...,For indoor utilizes these lights come in reduc...,23,0,0,24,1,30,41,2
3,female,30s,1. Look your best.<br />;If you're a Kurt Coba...,. Look your best. ;If you're a Kurt Cobain or ...,27,0,0,19,2,58,58,2
4,female,20s,For years tax debt resolution was about mislea...,For years tax debt resolution was about mislea...,25,0,1,1,2,35,58,1


In [23]:
# Replace NaN values with an empty string
data['Processed Text'] = data['Processed Text'].fillna('')

In [24]:
data.head(5)

Unnamed: 0,Gender,Age Group,Content,Processed Text,comma,hashtag,question,colon,semicolon,fullstop,uppercase,Age Group_indexed
0,female,30s,By deciding on a tidy organizational device yo...,By deciding on a tidy organizational device yo...,12,0,0,0,0,28,29,2
1,female,20s,Each year we all make the same resolutions: to...,Each year we all make the same resolutions: to...,15,0,5,19,2,21,64,1
2,male,30s,For indoor utilizes these lights come in reduc...,For indoor utilizes these lights come in reduc...,23,0,0,24,1,30,41,2
3,female,30s,1. Look your best.<br />;If you're a Kurt Coba...,. Look your best. ;If you're a Kurt Cobain or ...,27,0,0,19,2,58,58,2
4,female,20s,For years tax debt resolution was about mislea...,For years tax debt resolution was about mislea...,25,0,1,1,2,35,58,1


## Feature Extraction
TF-IDF, Count Vectorizer

In [29]:
# Assuming 'text_column' is the name of the column you want to process
column_name = 'Processed Text'

# Extract the specified column from X_train
feature_column = data[column_name]

# TF-IDF Vectorizer
tfidf_vect = TfidfVectorizer(max_features=1000)  # Adjust the value based on your needs
X_train_tfidf = tfidf_vect.fit_transform(feature_column)

# Count Vectorizer
count_vect = CountVectorizer(max_features=1000)
X_train_count = count_vect.fit_transform(feature_column)

# Convert the TF-IDF matrix to a dense array
dense_tfidf_array = X_train_tfidf.toarray()

# Convert the count matrix to a dense array
dense_count_array = X_train_count.toarray()

# Create a new column in the original DataFrame for the TF-IDF vectors
data['TFIDF_Vector'] = dense_tfidf_array.tolist()
data['Count_Vector'] = dense_count_array.tolist()

# Display the result
data

Unnamed: 0,Gender,Age Group,Content,Processed Text,comma,hashtag,question,colon,semicolon,fullstop,uppercase,Age Group_indexed,TFIDF_Vector,Count_Vector
0,female,30s,By deciding on a tidy organizational device yo...,By deciding on a tidy organizational device yo...,12,0,0,0,0,28,29,2,"[0.0, 0.0, 0.04128582597310149, 0.0, 0.0, 0.0,...","[0, 0, 2, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ..."
1,female,20s,Each year we all make the same resolutions: to...,Each year we all make the same resolutions: to...,15,0,5,19,2,21,64,1,"[0.0, 0.046827597648307996, 0.0979681016362920...","[0, 2, 6, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 1, 0, ..."
2,male,30s,For indoor utilizes these lights come in reduc...,For indoor utilizes these lights come in reduc...,23,0,0,24,1,30,41,2,"[0.0, 0.0, 0.06669870692841441, 0.0, 0.0, 0.0,...","[0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,female,30s,1. Look your best.<br />;If you're a Kurt Coba...,. Look your best. ;If you're a Kurt Cobain or ...,27,0,0,19,2,58,58,2,"[0.044461505762619036, 0.05569623958798594, 0....","[1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,female,20s,For years tax debt resolution was about mislea...,For years tax debt resolution was about mislea...,25,0,1,1,2,35,58,1,"[0.0, 0.0, 0.04006506808607367, 0.0, 0.0, 0.09...","[0, 0, 2, 0, 0, 2, 0, 0, 0, 1, 0, 0, 0, 0, 0, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,male,20s,unfortunate<br />;For your past three total mo...,"unfortunate ;For your past three total months,...",43,1,0,24,1,0,78,1,"[0.0, 0.03970863843543112, 0.02769150473543209...","[0, 2, 2, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ..."
9996,male,30s,Materials like Artex asbestos necessitate you ...,Materials like Artex asbestos necessitate you ...,27,0,0,23,1,27,41,2,"[0.0, 0.0, 0.04039767848620995, 0.0, 0.0, 0.0,...","[0, 0, 2, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, ..."
9997,female,20s,This is actually the tip of the curing approac...,This is actually the tip of the curing approac...,20,0,1,102,4,24,83,1,"[0.0, 0.0, 0.0672520324016974, 0.0, 0.0, 0.0, ...","[0, 0, 3, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, ..."
9998,female,30s,"On Feb 7 2011,??.I won a 32 inch Samsung TV.It...","On Feb ,??.I won a inch Samsung TV.It's that a...",1,0,2,1,0,3,8,2,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


## Data Splitting

In [39]:
# Assuming 'Processed Text', 'comma', 'hashtag', 'question', 'colon', 'semicolon', 'fullstop', 'uppercase' are column names
selected_columns = ['comma', 'hashtag', 'question', 'colon', 'semicolon', 'fullstop', 'uppercase','TFIDF_Vector','Count_Vector']

X = data[selected_columns]
y = data['Age Group_indexed']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [40]:
X_train

Unnamed: 0,comma,hashtag,question,colon,semicolon,fullstop,uppercase,TFIDF_Vector,Count_Vector
1330,32,0,2,22,2,29,53,"[0.0, 0.0, 0.01841923829945282, 0.0, 0.0, 0.0,...","[0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
5649,21,0,2,1,5,24,35,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ..."
9643,16,0,4,25,0,29,42,"[0.0, 0.029342929057023924, 0.0409255965930579...","[0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1779,17,0,1,1,1,23,37,"[0.0, 0.03060184311240365, 0.08536289501217949...","[0, 1, 4, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, ..."
6076,0,0,0,0,1,2,7,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...,...,...,...,...,...
5744,19,0,1,14,0,35,61,"[0.0, 0.0, 0.0816905894358756, 0.0, 0.0, 0.0, ...","[0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
5201,17,0,0,4,0,5,11,"[0.0, 0.0, 0.03563497039011296, 0.0, 0.0, 0.0,...","[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
5400,0,0,1,274,0,18,97,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
861,0,0,1,0,0,0,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


## Models

In [41]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [42]:
print(X_train.shape, y_train.shape)


(6988, 9) (6988,)


knn_tfidf = KNeighborsClassifier()
knn_tfidf.fit(Xtrain_tfidf, y_train)

predictions_knn= knn_tfidf.predict(Xtest_tfidf)
accuracy_knn= accuracy_score(y_test, predictions_knn)
precision_knn = precision_score(y_test, predictions_knn,average='weighted')
recall_knn = recall_score(y_test, predictions_knn,average='weighted')
f1_knn_10 = f1_score(y_test=0, predictions_knn=0,average='weighted')
f1_knn_20 = f1_score(y_test=1, predictions_knn=1,average='weighted')
f1_knn_30 = f1_score(y_test=2, predictions_knn=2,average='weighted')
print("10s data: ",f1_knn_10*100)
print("20s data: ",f1_knn_20*100)
print("30s data: ",f1_knn_30*100)
print("K-Nearest Neighbors Classifier  Accuracy :", "{:.2f}%".format(100*accuracy_knn))

### Adaboost

In [43]:
base_classifier = DecisionTreeClassifier(max_depth=1)
adaboost_classifier = AdaBoostClassifier(base_classifier, n_estimators=50, random_state=42)
adaboost_classifier.fit(X_train, y_train)

predictions_ada = adaboost_classifier.predict(X_test)
accuracy_ada = accuracy_score(y_test, predictions_ada)
precision_ada = precision_score(y_test, predictions_ada,average='weighted')
recall_ada = recall_score(y_test, predictions_ada,average='weighted')
#f1_ada= f1_score(y_test, predictions_ada,average='weighted')

f1_ada_0 = f1_score(y_test==0, predictions_ada==0)
f1_ada_1 = f1_score(y_test==1, predictions_ada==1)
f1_ada_2 = f1_score(y_test==2, predictions_ada==2)
print(" Adaboost Classifier  Accuracy :", "{:.2f}%".format(100*accuracy_ada))
print("10s data: ",f1_ada_0*100)
print("20s data: ",f1_ada_1*100)
print("30s data: ",f1_ada_2*100)

print("Adaboost Accuracy :", "{:.2f}%".format(100*f1_ada))

ValueError: setting an array element with a sequence.