# Lec 6 Text Classification/Feature Extraction


In [None]:
import numpy as np 
import pandas as pd

In [None]:
temp_df = pd.read_csv("IMDB Dataset.csv")

## Select 10K rows out of 50K

In [None]:
df = temp_df.iloc[:10000]

In [None]:
df.head()

In [None]:
df['review'][1]

## Check Data Distribution/Normality of Data

In [None]:
df['sentiment'].value_counts()

## Check Missing Values in Data 

In [None]:
df.isnull().sum()

## Check Duplicate Values in Data

In [None]:
df.duplicated().sum()

## Remove Duplicate Values in Data

In [None]:
df.drop_duplicates(inplace=True)

# Basic Pre-processing

## 1.Remove Tags 

In [None]:
import re
def remove_html_tags(text):
    clean = re.compile('<.*?>')
    return re.sub(clean,'',text)
df['review'] = df['review'].apply(remove_html_tags)

## 2.Lowercasing 

In [None]:
df['review'] = df['review'].apply(lambda review: review.lower())

## 3.Remove Stopwords 

In [None]:
from nltk.corpus import stopwords
stopwords_list = stopwords.words('english')
def remove_stopwords(text):
    return ' '.join([word for word in text.split() if word not in stopwords_list])
df['review'] = df['review'].apply(remove_stopwords)

## Now transformation is completed and data is ready for ML 

# 1.Create X and Y 

In [None]:
X = df.iloc[:,0:1]
y = df['sentiment']

# 2.Import LabelEncoder to Encode Categorical Data to Integer Data

In [None]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
y = encoder.fit_transform(y) 

# 3.Import Train_Test_Split

In [None]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=1,test_size=0.2)

In [None]:
X_train.shape

# 4.Import and Apply BOW 

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
model = CountVectorizer()
# Train the model
X_train_bow = model.fit_transform(X_train['review']).toarray()
X_test_bow = model.fit_transform(X_test['review']).toarray()

In [None]:
X_train_bow.shape # check no. of features after applying bag of words

# 4.Import and Apply NGram 

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
model = CountVectorizer(ngram_range=(1,2)) # You can use ngram rather than bows , it will make bigram 
# Train the model
X_train_bow = model.fit_transform(X_train['review']).toarray()
X_test_bow = model.fit_transform(X_test['review']).toarray()

## Select Max-Feature Manually 

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
model = CountVectorizer(max_features=3000) # for high accuracy ,reduce feature size of bag of words 
# Train the model
X_train_bow = model.fit_transform(X_train['review']).toarray()
X_test_bow = model.fit_transform(X_test['review']).toarray()

# 5.Import and Apply Gaussian NB

In [None]:
from sklearn.naive_bayes import GaussianNB
model_gnb = GaussianNB()
# Train the model
model_gnb.fit(X_train_bow,y_train)

In [None]:
y_pred = model_gnb.predict(X_test_bow) # Predict Unknown 

# 6.Check Accuracy of Model 

In [None]:
from sklearn.metrics import accuracy_score,confusion_matrix
accuracy_score(y_test,y_pred)
# hyper-parameter tuning can much increase accuracy level 

In [None]:
confusion_matrix(y_test,y_pred)

# By Random Forest Classifier Algorithm

In [None]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
# Train the model
rf.fit(X_train_bow,y_train)

In [None]:
y_pred = rf.predict(X_test_bow) # Predict Unknown 
accuracy_score(y_test,y_pred)

# 4.Using TF-IDF 
## It is used to make information-retreival systems

In [48]:
from sklearn.feature_extraction.text import TfidfVectorizer
Tfidf = TfidfVectorizer()

# Train the model
X_train_tfidf = Tfidf.fit_transform(X_train['review']).toarray()
X_test_tfidf = Tfidf.fit(X_test['review']).toarray()

rf = RandomForestClassifier()
# Train the model
rf.fit(X_train_tfidf,y_train)
# Predict Unknown 
y_pred = rf.pred(X_test_tfidf)
accuracy_score(X_test,y_pred)

AttributeError: 'TfidfVectorizer' object has no attribute 'toarray'