## 05 Text Classification with LogisticRegression

In [103]:
import pandas as pd
import seaborn as sns
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer

## Reading Data

In [102]:
df = pd.read_csv('../data/merged_plots.csv', index_col=0)
df

Unnamed: 0,title,plot,is_indian
0,10.0 Earthquake,As a series of minor earthquakes start tearing...,0
1,12 Rounds (film),A sting operation to capture arms dealer Miles...,0
2,12 Rounds 3: Lockdown,Detective Tyler Burke and his two men infiltr...,0
3,200 mph,When the older brother (Tommy Nash) he idolize...,0
4,Ablaze (2001 film),Andrew Thomas is an agent tasked with recordin...,0
...,...,...,...
3995,Choked (film),Sarita Pillai and Sushant Pillai are a married...,1
3996,Chumbak,"Chumbak is a coming-of-age story of Baalu, a t...",1
3997,Chungakkarum Veshyakalum,Chungakkarum Veshyakalum is the story of a Mal...,1
3998,Chuzhi,Varghese is a planter who lives with his wife ...,1


## Pre Processing

In [25]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

porter_stemmer=PorterStemmer()
stop_words = set(stopwords.words('english'))
non_name_words = set(nltk.corpus.words.words())

def clean(text, ner=False):

    # lowercasing
    text = text.lower()

    # removing special characters
    text = re.sub('\\W', ' ', text)

    # splitting into tokens
    words = text.split()

    # removing stopwords
    filtered_words = []
    for word in words:
        if not word in stop_words:
            filtered_words.append(word)
    
    # word stemming
    stemmed_words = [porter_stemmer.stem(word) for word in filtered_words]
    
    # removing names
    if ner:
        words_with_ner = []
        for word in stemmed_words:
            if word in non_name_words:
                words_with_ner.append(word)
        result = words_with_ner
    else:
        result = stemmed_words
        
    return ' '.join(result)

### Without Named Entity Removal

In [100]:
# NamedEntityRemoval = False
plots_cleaned = [clean(plot, ner=False) for plot in df['plot']]
print(plots_cleaned[0])

seri minor earthquak start tear apart lo angel scientist emili usg theoriz build super quak drop entir citi lava fill chasm engin jack whose daughter gone camp friend danger area whose compani respons quak due deep frack feel oblig help race emili increasingli damag citi hope divert epicent long beach potenti save million live citi lo angel


### With Named Entity Removal

In [54]:
# NamedEntityRemoval = True
plots_cleaned_ner = [clean(plot, ner=True) for plot in df['plot']]
print(plots_cleaned_ner[0])

minor start tear apart lo angel scientist build super drop lava fill chasm jack whose daughter gone camp friend danger area whose due deep frack feel help race hope divert long beach save million live lo angel


## Vectorization
Scikit-learn’s CountVectorizer is used to convert a collection of text documents to a vector of token counts.

In [98]:
vectorizer = CountVectorizer(min_df=0)
vectorizer.fit(plots_cleaned)
print('Vocab length WITHOUT NER:', len(vectorizer.vocabulary_))

Vocab length WITHOUT NER: 32340


In [101]:
vectorizer_ner = CountVectorizer(min_df=0)
vectorizer_ner.fit(plots_cleaned_ner)
print('Vocab length WITH NER:', len(vectorizer_ner.vocabulary_))

Vocab length WITH NER: 8867


## Data Splitting

In [81]:
y = df['is_indian'].values

In [83]:
X = vectorizer.transform(plots_cleaned)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state= 1000)

In [84]:
X_ner = vectorizer_ner.transform(plots_cleaned_ner)

X_train_ner, X_test_ner, y_train, y_test = train_test_split(X_ner, y, test_size=0.25, random_state= 1000)

## Logistic Regression

In [90]:
reg = LogisticRegression(solver='liblinear', random_state=42)
reg = reg.fit(X_train, y_train)

In [91]:
reg_ner = LogisticRegression(solver='liblinear', random_state=42)
reg_ner = reg_ner.fit(X_train_ner, y_train)

In [95]:
predictions = reg.predict(X_test)
score = accuracy_score(y_test, predictions)

print(f'Accuracy sore WITHOUT ner: {score}')

Accuracy sore WITHOUT ner: 0.938


In [94]:
predictions_ner = reg_ner.predict(X_test_ner)
score_ner = accuracy_score(y_test, predictions_ner)

print(f'Accuracy sore WITH ner: {score_ner}')

Accuracy sore WITH ner: 0.916
