<a href="https://colab.research.google.com/github/Amar-Pratap-Singh/ML-Project-1/blob/main/ML_Project1_MachineLearners.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import nltk
import pandas as pd
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.classify.scikitlearn import SklearnClassifier
from nltk.classify import NaiveBayesClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

### Reading dataset

In [None]:
# Load the training split; later cells read its 'question_text' and 'target' columns.
# NOTE(review): the relative path assumes a Kaggle-style ../input layout — confirm before running elsewhere.
df = pd.read_csv("../input/yahoo-troll-question-detection/train_df.csv")

# PREPROCESSING

Finding duplicates

In [None]:
# Number of rows that exactly repeat an earlier row (all columns equal).
int(df.duplicated().sum())

0

## 1. Removing Punctuations

In [None]:
import string
# Every ASCII punctuation character from string.punctuation, as a list of one-character strings.
punctuations = list(string.punctuation)

def remove_punctuation(text, punctuations):
    """Return ``text`` with every entry of ``punctuations`` deleted.

    Args:
        text: the string to clean.
        punctuations: iterable of substrings to remove, processed in order.

    Returns:
        The cleaned string with leading and trailing whitespace stripped.
    """
    cleaned = text
    for mark in punctuations:
        # str.replace leaves the string unchanged when ``mark`` is absent,
        # so no separate membership test is needed.
        cleaned = cleaned.replace(mark, '')
    return cleaned.strip()


# Strip punctuation from each raw question; apply() forwards `punctuations`
# as the second positional argument of remove_punctuation.
df['question_text_pre_processed'] = df['question_text'].apply(remove_punctuation, args=(punctuations,))

## 2. Lowercasing the Text

In [None]:
# Lowercase every question so that token matching is case-insensitive.
df['question_text_pre_processed'] = df['question_text_pre_processed'].apply(str.lower)


## 3. Removing the STOP WORDS

In [None]:
# Keep the stop words in a set: each membership test below is then O(1) instead of
# a linear scan of NLTK's list for every word of every question.
# NOTE(review): punctuation was already stripped from the text, so contractions in the
# stop-word list (e.g. "don't") can no longer match their stripped form ("dont") — confirm
# whether that is intended.
stopwords = set(nltk.corpus.stopwords.words('english'))

# Drop stop words and re-join the remaining words with single spaces.
df['question_text_pre_processed'] = df['question_text_pre_processed'].apply(
    lambda text: ' '.join(word for word in text.split() if word not in stopwords)
)


## 4. Tokenization 
using the `word_tokenize` function from `nltk.tokenize`

In [None]:
from nltk.tokenize import word_tokenize

# Replace each cleaned question string with its list of tokens.
df['question_text_pre_processed'] = df['question_text_pre_processed'].apply(word_tokenize)


In [None]:
df['question_text_pre_processed'].head()

0             [interesting, facts, microsoft, history]
1                      [things, gon, na, happen, ever]
2    [know, avoid, upsold, getting, car, brakes, ch...
3                        [add, account, payment, bank]
4    [multi, level, marketing, products, actually, ...
Name: question_text_pre_processed, dtype: object

## 5. Stemming 
using the `PorterStemmer.stem` method from `nltk.stem`

In [None]:
ps = PorterStemmer()

# Stem every token of every question; the result is stored in a new column so the
# unstemmed tokens remain available.
df['question_text_pre_processed_root'] = df['question_text_pre_processed'].apply(
    lambda tokens: list(map(ps.stem, tokens))
)


In [None]:
df['question_text_pre_processed_root'].head()

0                 [interest, fact, microsoft, histori]
1                       [thing, gon, na, happen, ever]
2        [know, avoid, upsold, get, car, brake, chang]
3                        [add, account, payment, bank]
4    [multi, level, market, product, actual, worth,...
Name: question_text_pre_processed_root, dtype: object

## 6. Lemmatization 
using the `WordNetLemmatizer.lemmatize` method from `nltk.stem` (currently disabled — the code below is commented out)

In [None]:
# word_lemmatize = WordNetLemmatizer()

# def lemmatizer(text):
#     lemm_text = [word_lemmatize.lemmatize(word) for word in text]
#     return lemm_text

# df['question_text_pre_processed_base'] = df['question_text_pre_processed_root'].apply(lambda text: [lemmatizer(text)])

In [None]:
# df['question_text_pre_processed_base'][1]

# GENERATING MODELS

## Feature Extraction

In [None]:
# The stemmed token lists are the input from which features are built.
X = df["question_text_pre_processed_root"]

In [None]:
words = []

# Keep the first 90% of the questions. Slicing the DataFrame column directly (instead of
# re-slicing X) keeps this cell idempotent: re-running it no longer shrinks X by a
# further 10% each time.
X = df["question_text_pre_processed_root"].iloc[: int(0.9 * len(df))]

In [None]:
len(X)

900000

In [None]:
# Flatten the per-question token lists into one list holding every token occurrence.
# The list is rebuilt from scratch rather than appended to, so re-running this cell
# cannot double-count tokens and does not depend on what `words` was bound to before.
words = [word for text in X for word in text]

In [None]:
words[:20]

['interest',
 'fact',
 'microsoft',
 'histori',
 'thing',
 'gon',
 'na',
 'happen',
 'ever',
 'know',
 'avoid',
 'upsold',
 'get',
 'car',
 'brake',
 'chang',
 'add',
 'account',
 'payment',
 'bank']

##### Counting frequency of each word in the dataset

In [None]:
# Count how often each token occurs.
# NOTE: this rebinds `words` from a list of tokens to an nltk.FreqDist.
words = nltk.FreqDist(words)

In [None]:
## To print values of frequency and sorting them

# word_freq = pd.DataFrame(list(words.items()), columns = ["Words","Frequency"])
# word_freq.sort_values('Frequency', ascending=False)

We use every word in the vocabulary as a feature word (no frequency cutoff is applied).

In [None]:
# Alias for the full frequency distribution: every observed token counts as a feature word.
# NOTE(review): no later cell visible here reads feature_words — confirm whether it is still needed.
feature_words = words

In [None]:
# Make sure every token in every question is a plain str.

X = X.apply(lambda tokens: list(map(str, tokens)))

In [None]:
def find_features(text):
    """Build a featureset dictionary for one tokenized question.

    Args:
        text: iterable of tokens belonging to a single question.

    Returns:
        dict mapping each distinct token to True, in first-occurrence order.
        Every token is kept; nothing is filtered against a vocabulary.
    """
    return dict.fromkeys(text, True)

In [None]:
# One featureset dictionary per question, in the same order as X.
feature_set = list(map(find_features, X))

In [None]:
# Preview only the first few featuresets; displaying the whole list would emit one
# dictionary per question and flood the notebook output.
feature_set[:5]

[{'interest': True, 'fact': True, 'microsoft': True, 'histori': True},
 {'thing': True, 'gon': True, 'na': True, 'happen': True, 'ever': True},
 {'know': True,
  'avoid': True,
  'upsold': True,
  'get': True,
  'car': True,
  'brake': True,
  'chang': True},
 {'add': True, 'account': True, 'payment': True, 'bank': True},
 {'multi': True,
  'level': True,
  'market': True,
  'product': True,
  'actual': True,
  'worth': True,
  'purchas': True},
 {'scope': True,
  'would': True,
  'recommend': True,
  'remington': True,
  'model': True,
  '700': True,
  '270': True,
  'shoot': True,
  '5600': True,
  'yard': True},
 {'black': True, 'peopl': True, 'sustain': True, 'civil': True},
 {'citi': True, 'better': True, 'de': True, 'moin': True, 'omaha': True},
 {'thicken': True, 'stir': True, 'fri': True, 'sauc': True},
 {'woman': True, 'squirt': True, 'rub': True, 'clit': True, 'gentli': True},
 {'us': True,
  'annual': True,
  'parad': True,
  'icbm': True,
  'street': True,
  'washington': T

#### Dividing given data into training and testing data

In [None]:
# Pair every featureset with its label as [featureset, label].
# zip() pairs the two sequences by position, so this does not rely on a hard-coded row
# count or on the DataFrame's index labels matching list positions, and it avoids the
# slow row-by-row iterrows() loop.
question_target = [
    [features, target]
    for features, target in zip(feature_set, df['target'].iloc[:len(feature_set)])
]


In [None]:
# Positional 90/10 split: the first 90% of the pairs are used for training and the
# remaining 10% for evaluation.
split_at = int(len(question_target) * 0.9)
training_data = question_target[:split_at]
testing_data = question_target[split_at:]

## MODELS

## 1. Naive Bayes 

In [None]:
# Train NLTK's own Naive Bayes classifier on the [featureset, label] training pairs.
naive_bayes_model = NaiveBayesClassifier.train(training_data)

In [None]:
# Fraction of held-out pairs whose predicted label equals the true label.
# NOTE(review): if the 'target' classes are imbalanced, plain accuracy can look high for a
# model that mostly predicts the majority class — consider also reporting precision/recall/F1.
accuracy = nltk.classify.accuracy(naive_bayes_model, testing_data)

In [None]:
print("Using Naive Bayes\nAccuracy: ", accuracy*100, "%")

Using Naive Bayes
Accuracy:  83.92333333333333 %


## 2. Multinomial Naive Bayes

In [None]:
# Wrap scikit-learn's MultinomialNB so it can be trained and evaluated through NLTK's classifier interface.
multi_naive_bayes_model = SklearnClassifier(MultinomialNB())

In [None]:
multi_naive_bayes_model.train(training_data)

<SklearnClassifier(MultinomialNB())>

In [None]:
accuracy = nltk.classify.accuracy(multi_naive_bayes_model, testing_data)

In [None]:
print("Using Multinomial Naive Bayes\nAccuracy: ", accuracy*100, "%")

Using Multinomial Naive Bayes
Accuracy:  93.98555555555555 %


## 3. Logistic Regression

In [None]:
# Wrap scikit-learn's LogisticRegression (default hyper-parameters) in NLTK's classifier interface.
# NOTE(review): the default iteration limit may be too low for a vocabulary this large — check
# for a ConvergenceWarning during training and raise max_iter if one appears.
logistic_regression_model = SklearnClassifier(LogisticRegression())

In [None]:
logistic_regression_model.train(training_data)

In [None]:
accuracy = nltk.classify.accuracy(logistic_regression_model, testing_data)

In [None]:
print("Using Logistic Regression\nAccuracy: ", accuracy*100, "%")

Using Logistic Regression
Accuracy:  95.08222222222223 %


## Bag of words

In [None]:
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.svm import SVC
# from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
# from sklearn.pipeline import Pipeline
# from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# vectorizer = TfidfVectorizer()
# X = vectorizer.fit_transform(df["question_text_pre_processed"])

In [None]:
# X.shape

## Pipeline and Random Forest

In [None]:
# X.shape

In [None]:
# clf = Pipeline([('tfidf', TfidfVectorizer()), ('clf', RandomForestClassifier(n_estimators=100, n_jobs=-1))])

In [None]:
# clf.fit(df['question_text_pre_processed'], df['target'])

In [None]:
# df2 = pd.read_csv("../input/yahoo-troll-question-detection/test_df.csv")

# Y_pred = clf.predict(df2["question_text"])