In [12]:
# In this project, we classify blog topics based on the text of each blog post.
# Dataset: https://www.kaggle.com/rtatman/blog-authorship-corpus
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# tfidf for feature extraction from text
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
# import ML models 
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC, SVC

# Metrics for model evaluation
from sklearn import metrics
from sklearn.metrics import classification_report , accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score

In [13]:
# Load the cleaned blog-text CSV from its project-relative path.
df = pd.read_csv(filepath_or_buffer=r'dataset/blogtext/cleaned_blogtext.csv')

In [14]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,text,topic
0,ive watching lot movie lately month may ive wa...,Student
1,pop game web game ever played happens clear le...,Student
2,step yeah come school monday mean bunch god th...,Student
3,ok havent posted sorry bout anyways lang ako,Student
4,hi may ill town weekend leaving thursday after...,Student


In [15]:
# Number of missing values in each column, in column order.
df.isna().sum().tolist()

[1225, 0]

In [16]:
# Discard every row that has a missing value in any column.
df = df.dropna(axis="index", how="any")

In [17]:
# Raw per-document word counts, computed for inspection only;
# these counts are not used as model input.
cv = CountVectorizer()
data_cv = cv.fit_transform(df['text'])

In [18]:
# Report the container type and (documents, vocabulary) shape of the counts.
print(type(data_cv), data_cv.shape, sep="\n")

<class 'scipy.sparse.csr.csr_matrix'>
(38626, 1637)


In [19]:
# Inspect the sparse term counts stored for the document at row position 10.
print(data_cv[10])

  (0, 553)	1
  (0, 110)	1
  (0, 1322)	1
  (0, 688)	1
  (0, 375)	1
  (0, 1624)	1
  (0, 1501)	1
  (0, 699)	1
  (0, 201)	1
  (0, 910)	1
  (0, 899)	1
  (0, 1539)	1
  (0, 1258)	1
  (0, 16)	1
  (0, 840)	5
  (0, 1357)	1
  (0, 949)	1
  (0, 1455)	1
  (0, 1535)	1
  (0, 1588)	1
  (0, 846)	1
  (0, 1327)	1
  (0, 345)	3
  (0, 1594)	1
  (0, 584)	1
  (0, 866)	1
  (0, 1109)	1
  (0, 383)	4
  (0, 651)	1
  (0, 1432)	1
  (0, 179)	1
  (0, 570)	1
  (0, 471)	1
  (0, 1572)	2
  (0, 1601)	1
  (0, 1544)	1
  (0, 1268)	1
  (0, 1553)	2
  (0, 1554)	1


In [20]:
# Fit a TF-IDF vectorizer with default settings on the blog text and keep the
# resulting sparse document-term matrix as the model input.
tfidf = TfidfVectorizer()
features = tfidf.fit_transform(df['text'])

In [21]:
# Report the container type and (documents, vocabulary) shape of the TF-IDF matrix.
print(type(features), features.shape, sep="\n")

<class 'scipy.sparse.csr.csr_matrix'>
(38626, 1637)


In [22]:
# Dense view of the TF-IDF matrix for inspection. Column names are taken from
# the TF-IDF vectorizer itself (not the separate CountVectorizer), so they stay
# correct even if the two vectorizers are ever configured differently.
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# when available and fall back to the old method on older releases.
feature_names = (
    tfidf.get_feature_names_out()
    if hasattr(tfidf, 'get_feature_names_out')
    else tfidf.get_feature_names()
)
# NOTE: toarray() materialises every zero entry, so memory grows as
# rows x vocabulary size. The row index is aligned with df.
features_df = pd.DataFrame(features.toarray(), columns=feature_names, index=df.index)

In [23]:
# Preview the first five rows of the dense TF-IDF frame.
features_df.head()

Unnamed: 0,ability,able,abortion,absolutely,accept,access,according,account,across,act,...,yes,yesterday,yet,york,youd,youll,young,youre,youth,youve
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.058616,...,0.0,0.0,0.045246,0.0,0.0,0.124555,0.0,0.048036,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
# Encode the categorical target (topic) as integer codes.
# The code -> label lookup is kept in `topic_labels` (position i holds the
# original label for code i) so predictions can be mapped back to topic names.
# The dtype guard makes the cell safe to re-run: once the column is already
# integer-coded it is left alone and `topic_labels` is not overwritten.
if not pd.api.types.is_integer_dtype(df['topic']):
    topic_codes, topic_labels = pd.factorize(df['topic'])
    # assign() builds a new frame instead of writing into one derived from
    # dropna(), which avoids chained-assignment warnings.
    df = df.assign(topic=topic_codes)
df['topic'].value_counts()


6    4886
3    4876
4    4863
0    4850
1    4847
2    4845
5    4778
7    4681
Name: topic, dtype: int64

In [25]:
# Candidate classifiers to compare. A seed is fixed on every estimator that
# accepts one so the fold scores are reproducible across runs.
models = [
    RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0),
    LinearSVC(random_state=0),
    MultinomialNB(),
    LogisticRegression(random_state=0),
]

# Number of cross-validation folds.
CV = 5

# NOTE(review): `features` was produced by fitting TF-IDF on the whole corpus,
# so document-frequency statistics from each validation fold leak into
# training. Wrapping the vectorizer and classifier in a Pipeline and passing
# the raw text to cross_val_score would give an unbiased estimate.
entries = []
for model in models:
    model_name = model.__class__.__name__
    accuracies = cross_val_score(model, features, df.topic, scoring='accuracy', cv=CV)
    # Record one (model, fold, accuracy) row per fold.
    entries.extend(
        (model_name, fold_idx, accuracy)
        for fold_idx, accuracy in enumerate(accuracies)
    )

# Long-format results table; the cell displays the mean accuracy per model.
cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])
cv_df.groupby('model_name').accuracy.mean()




model_name
LinearSVC                 0.316311
LogisticRegression        0.319548
MultinomialNB             0.307225
RandomForestClassifier    0.255654
Name: accuracy, dtype: float64

In [26]:
# Accuracy is only about 26-32% across the models. Something's not right; need to investigate further.