<a href="https://colab.research.google.com/github/Delta9529/Mini-Bootcamp-Python-Pandas-Matplotlib-NLP/blob/main/NLP_Graded_Exercise.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Problem Overview

In this challenge, you will work with a dataset of news headlines, some of which were deliberately written in a sarcastic manner by their authors. Our job is to build NLP models that predict whether a given headline is sarcastic or not.

### About the Data

Each record of the dataset consists of two attributes:

`is_sarcastic`: 1 if the headline is sarcastic, otherwise 0. This is the target variable.

`headline`: the headline of the news article.

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report

In [None]:
# Mount Google Drive into the Colab VM so that the CSV files under
# /content/drive can be read by the cells below.
# Colab-only: the google.colab package is not available in a regular Jupyter kernel.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Location of the labelled training set on the mounted Drive.
TRAIN_CSV = '/content/drive/MyDrive/Colab Notebooks/Dphi/Mini Bootcamps/Train_Data.csv'

# Load the training headlines and preview the first rows.
df = pd.read_csv(TRAIN_CSV)
df.head()

Unnamed: 0,headline,is_sarcastic
0,supreme court votes 7-2 to legalize all worldl...,1
1,hungover man horrified to learn he made dozens...,1
2,emily's list founder: women are the 'problem s...,0
3,send your kids back to school with confidence,0
4,watch: experts talk pesticides and health,0


In [None]:
# Number of whitespace-separated tokens in each headline.
# Series.str.split() with no separator splits on runs of whitespace, so repeated,
# leading or trailing spaces do not create empty tokens (as split(" ") would).
# This keeps the count consistent with the `word_count` feature computed further down.
df['len'] = df['headline'].str.split().str.len()
df.head()

Unnamed: 0,headline,is_sarcastic,len
0,supreme court votes 7-2 to legalize all worldl...,1,9
1,hungover man horrified to learn he made dozens...,1,12
2,emily's list founder: women are the 'problem s...,0,10
3,send your kids back to school with confidence,0,8
4,watch: experts talk pesticides and health,0,6


In [None]:
# (rows, columns) of the training frame, counting the derived `len` column added above.
df.shape

(44262, 3)

## Text Pre-processing

In [None]:
import re
import nltk

# Fetch the NLTK resources this notebook asks for (the stop-word lists and the
# punkt tokenizer models); nltk.download reports packages that are already current.
for resource in ('stopwords', 'punkt'):
    nltk.download(resource)

from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
from sklearn.model_selection import train_test_split

# Inputs: the raw headline text (kept as a one-column frame) and the 0/1 label.
features = df[['headline']]
target = df['is_sarcastic']

# Default split proportions; random_state pins the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(features, target, random_state=0)

(X_train.shape, X_test.shape)

((33196, 1), (11066, 1))

In [None]:
import string


def add_text_features(frame):
    """Return a copy of ``frame`` with surface-level headline statistics appended.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain a string column named ``headline``.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the original columns followed by ``char_count``,
        ``word_count``, ``word_density``, ``punctuation_count``,
        ``title_word_count`` and ``upper_case_word_count`` (in that order).
        The input frame is not modified.
    """
    enriched = frame.copy()
    text = enriched['headline']
    # Tokenise once on whitespace and reuse the tokens for every word-level count.
    words = text.apply(lambda headline: headline.split())

    enriched['char_count'] = text.apply(len)
    enriched['word_count'] = words.apply(len)
    # The +1 in the denominator keeps the ratio finite when word_count is 0.
    enriched['word_density'] = enriched['char_count'] / (enriched['word_count'] + 1)
    enriched['punctuation_count'] = text.apply(
        lambda headline: sum(ch in string.punctuation for ch in headline)
    )
    # NOTE(review): the previewed headlines are all lower-case, so the two counts
    # below may be constant zero; confirm that they carry any signal.
    enriched['title_word_count'] = words.apply(
        lambda tokens: sum(tok.istitle() for tok in tokens)
    )
    enriched['upper_case_word_count'] = words.apply(
        lambda tokens: sum(tok.isupper() for tok in tokens)
    )
    return enriched


# Rebind to the returned copies instead of adding columns to the frames that
# train_test_split handed back: this avoids pandas' SettingWithCopyWarning, keeps
# the cell idempotent on re-run, and guarantees that the train and hold-out
# features are computed by exactly the same code.
X_train = add_text_features(X_train)
X_test = add_text_features(X_test)

In [None]:
# X_train is already a DataFrame, so it can be previewed directly
# without re-wrapping it in pd.DataFrame(...).
X_train.head()

Unnamed: 0,headline,char_count,word_count,word_density,punctuation_count,title_word_count,upper_case_word_count
44223,"medicare for all is coming, no matter what the...",51,10,4.636364,1,0,0
479,dog born with odds stacked against her found j...,72,13,5.142857,0,0,0
32626,"this earth day, i stand for science",35,7,4.375,1,0,0
29016,some people are pissed off about the casting o...,72,13,5.142857,0,0,0
36834,even more evidence that anxiety can be genetic,46,8,5.111111,0,0,0


## Using RandomForest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Small random forest: 10 trees split on Gini impurity.
# random_state fixes the forest's internal randomness so repeated runs match.
classifier = RandomForestClassifier(
    n_estimators=10,
    criterion='gini',
    random_state=1,
)

In [None]:
# The model is trained on the engineered columns only; the raw text is dropped.
train_matrix = X_train.drop(columns=['headline'])
holdout_matrix = X_test.drop(columns=['headline'])

classifier.fit(train_matrix, y_train)
predictions = classifier.predict(holdout_matrix)

# Per-class precision/recall/F1 on the hold-out split, followed by the
# confusion matrix (rows = true label, columns = predicted label).
print(classification_report(y_test, predictions))
pd.DataFrame(confusion_matrix(y_test, predictions))

              precision    recall  f1-score   support

           0       0.64      0.72      0.68      6065
           1       0.60      0.52      0.56      5001

    accuracy                           0.63     11066
   macro avg       0.62      0.62      0.62     11066
weighted avg       0.63      0.63      0.62     11066



Unnamed: 0,0,1
0,4344,1721
1,2399,2602


## Loading and pre-processing test data

In [None]:
# Headlines to generate predictions for, read from the mounted Drive.
TEST_CSV = '/content/drive/MyDrive/Colab Notebooks/Dphi/Mini Bootcamps/Test_Data.csv'
test_data = pd.read_csv(TEST_CSV)

In [None]:
# Preview the first rows of the freshly loaded test file.
test_data.head()

Unnamed: 0,headline
0,area stand-up comedian questions the deal with...
1,dozens of glowing exit signs mercilessly taunt...
2,perfect response to heckler somewhere in prop ...
3,gop prays for ossoff lossoff
4,trevor noah says the scary truth about trump's...


In [None]:
# (rows, columns) of the test frame before any features are added.
test_data.shape

(11066, 1)

In [None]:
import string

# Build the same six columns, in the same order, that the classifier was fitted on.
headline_text = test_data['headline']
# Whitespace-tokenise once; the word-level counts below reuse these token lists.
headline_words = headline_text.apply(lambda text: text.split())

test_data['char_count'] = headline_text.apply(len)
test_data['word_count'] = headline_words.apply(len)
# Characters per word, with +1 in the denominator so the ratio stays finite.
test_data['word_density'] = test_data['char_count'] / (test_data['word_count'] + 1)
test_data['punctuation_count'] = headline_text.apply(
    lambda text: sum(ch in string.punctuation for ch in text)
)
test_data['title_word_count'] = headline_words.apply(
    lambda words: sum(word.istitle() for word in words)
)
test_data['upper_case_word_count'] = headline_words.apply(
    lambda words: sum(word.isupper() for word in words)
)

In [None]:
# Preview the test frame again, now including the engineered feature columns.
test_data.head()

Unnamed: 0,headline,char_count,word_count,word_density,punctuation_count,title_word_count,upper_case_word_count
0,area stand-up comedian questions the deal with...,65,9,6.5,2,0,0
1,dozens of glowing exit signs mercilessly taunt...,65,9,6.5,0,0,0
2,perfect response to heckler somewhere in prop ...,62,9,6.2,1,0,0
3,gop prays for ossoff lossoff,28,5,4.666667,0,0,0
4,trevor noah says the scary truth about trump's...,65,11,5.416667,1,0,0


In [None]:
# Score the test headlines with the fitted forest, using the engineered columns only.
scoring_matrix = test_data.drop(columns=['headline'])
prediction = classifier.predict(scoring_matrix)

In [None]:
from google.colab import files

# A single `prediction` column that shares the test frame's index.
res = pd.DataFrame({'prediction': prediction}, index=test_data.index)

# Write the predictions (index column included) to the Colab VM's working
# directory, then trigger a browser download of that file.
output_name = 'pred_new.csv'
res.to_csv(output_name)
files.download(output_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>