# Mount Drive and Imports

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [39]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report, accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn import ensemble 

import pandas as pd

In [3]:
from keras.models import Sequential
from keras import layers
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.wrappers.scikit_learn import KerasClassifier

# EDA and Model Building with Count Vectorizer

In [4]:
df_train = pd.read_csv("/content/drive/MyDrive/ML_datasets/sarcastic dataset/Train.csv")
df_train.head()

Unnamed: 0,ID,comment,date,down,parent_comment,score,top,topic,user,label
0,uid_590555,"Well, let's be honest here, they don't actuall...",2015-04,0,They should shut the fuck up and let the commu...,2,2,starcitizen,Combat_Wombatz,0
1,uid_671762,"Well, I didn't need evidence to believe in com...",2016-12,-1,You need evidence to kill people? I thought we...,6,-1,EnoughCommieSpam,starkadd,1
2,uid_519689,"Who does an ""official promo"" in 360p?",2013-11,0,2014 BMW S1000R: Official Promo,3,3,motorcycles,phybere,0
3,uid_788362,Grotto koth was the best,2015-09,0,Not really that memorable lol if you want memo...,2,2,hcfactions,m0xyMC,1
4,uid_299252,Neal's back baby,2015-11,0,James Neal hit on Zach Parise,-5,-5,hockey,Somuch101,1


In [61]:
df_train.shape

(15000, 10)

In [6]:
df_train.iloc[0]['comment']

"Well, let's be honest here, they don't actually seem to do much moderating, so they have to spend their time doing *something*."

In [7]:
df_train.iloc[2]['comment']

'Who does an "official promo" in 360p?'

In [9]:
df_train['label'].value_counts()

1    7527
0    7473
Name: label, dtype: int64

In [11]:
sentences = df_train[['comment']].values
y = df_train['label'].values

sentences_train, sentences_test, y_train, y_test = train_test_split(sentences, y, test_size=0.25, random_state=1000)

In [15]:
vectorizer = CountVectorizer()
vectorizer.fit(sentences_train.ravel())

X_train = vectorizer.transform(sentences_train.ravel())
X_test  = vectorizer.transform(sentences_test.ravel())

In [36]:
classifier = LogisticRegression(max_iter=200)
classifier.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=200,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [37]:
score = classifier.score(X_test, y_test)
print("Accuracy:", score)

Accuracy: 0.6333333333333333


In [38]:
predictions = classifier.predict(X_test)
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.63      0.69      0.65      1904
           1       0.64      0.58      0.61      1846

    accuracy                           0.63      3750
   macro avg       0.63      0.63      0.63      3750
weighted avg       0.63      0.63      0.63      3750



In [44]:
#Ensemble Learning
classifier = ensemble.GradientBoostingClassifier(
    n_estimators = 200, #how many decision trees to build
    learning_rate = 2.0, #learning rate
    max_depth = 200
)

In [45]:
classifier.fit(X_train, y_train)

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=2.0, loss='deviance', max_depth=200,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=200,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [46]:
score = classifier.score(X_test, y_test)
print("Accuracy:", score)

Accuracy: 0.5976


In [47]:
predictions = classifier.predict(X_test)
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.60      0.64      0.62      1904
           1       0.60      0.55      0.57      1846

    accuracy                           0.60      3750
   macro avg       0.60      0.60      0.60      3750
weighted avg       0.60      0.60      0.60      3750



# Model Building with TF-IDF

In [54]:
from sklearn.feature_extraction.text import TfidfVectorizer

# convert all text to lowercase 
df_train['comment'] = df_train['comment'].str.lower()

In [55]:
sentences = df_train[['comment']].values
y = df_train['label'].values

sentences_train, sentences_test, y_train, y_test = train_test_split(
    sentences, y, test_size=0.25, random_state=1000)

In [58]:
tfidf = TfidfVectorizer()

X_train_tfidf = tfidf.fit_transform(sentences_train.ravel())
X_test_tfidf = tfidf.transform(sentences_test.ravel())

In [59]:
classifier = LogisticRegression(max_iter=200)
classifier.fit(X_train_tfidf, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=200,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [60]:
predictions = classifier.predict(X_test_tfidf)
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.64      0.70      0.67      1904
           1       0.66      0.60      0.63      1846

    accuracy                           0.65      3750
   macro avg       0.65      0.65      0.65      3750
weighted avg       0.65      0.65      0.65      3750



# Combining Features

In [63]:
df_train['combined_text'] = df_train['comment'] + " " + df_train['parent_comment']
df_train.fillna(" ",inplace = True)
df_train['combined_text'].head()

0    well, let's be honest here, they don't actuall...
1    well, i didn't need evidence to believe in com...
2    who does an "official promo" in 360p? 2014 BMW...
3    grotto koth was the best Not really that memor...
4       neal's back baby James Neal hit on Zach Parise
Name: combined_text, dtype: object

In [64]:
df_train['combined_text'] = df_train['combined_text'].str.lower()

In [65]:
sentences = df_train[['combined_text']].values
y = df_train['label'].values

sentences_train, sentences_test, y_train, y_test = train_test_split(
    sentences, y, test_size=0.25, random_state=1000)

In [66]:
tfidf = TfidfVectorizer()

X_train_tfidf = tfidf.fit_transform(sentences_train.ravel())
X_test_tfidf = tfidf.transform(sentences_test.ravel())

In [67]:
classifier = LogisticRegression(max_iter=200)
classifier.fit(X_train_tfidf, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=200,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [68]:
predictions = classifier.predict(X_test_tfidf)
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.60      0.62      0.61      1904
           1       0.60      0.58      0.59      1846

    accuracy                           0.60      3750
   macro avg       0.60      0.60      0.60      3750
weighted avg       0.60      0.60      0.60      3750



# Testing CSV for submission

In [23]:
df_test = pd.read_csv("/content/drive/MyDrive/ML_datasets/sarcastic dataset/Test.csv")
df_test.head()

Unnamed: 0,ID,comment,date,down,parent_comment,score,top,topic,user
0,uid_764784,Meanwhile if I pick my boxer up to move him to...,2016-01,0,Part of the Family Fun,2,2,gifs,Cameltoe-Swampdonkey
1,uid_67552,Thats what you get for using an unfair advanta...,2015-03,0,received a warning for hacking while overclock...,1,1,h1z1,Gothika_47
2,uid_240490,only to be later faced with thousands of dolla...,2010-09,0,Running shoes and race entries. I started runn...,1,1,Frugal,tells
3,uid_56568,What a thoughtful rebuttal to a well articulat...,2015-12,0,"Actually, you should get over your self. There...",-1,-1,CFB,YourToothbrush
4,uid_875860,"Yes, major life experiences that demand a grea...",2014-04,0,"As a bonus, it was with his ex, months after t...",-5,-5,AdviceAnimals,drunken_trophy_wife


In [32]:
df_test.columns

Index(['ID', 'comment', 'date', 'down', 'parent_comment', 'score', 'top',
       'topic', 'user'],
      dtype='object')

In [24]:
sentences_testing = df_test['comment'].values

In [25]:
X_testing = vectorizer.transform(sentences_testing)

In [26]:
pred_test = classifier.predict(X_testing)
pred_test

array([0, 1, 1, ..., 1, 0, 0])

# Submission to leaderboard

In [27]:
submission = pd.read_csv("/content/drive/MyDrive/ML_datasets/sarcastic dataset/sample_submission.csv")
submission.head()

Unnamed: 0,ID,label
0,uid_764784,0
1,uid_67552,0
2,uid_240490,0
3,uid_56568,1
4,uid_875860,1


In [28]:
submission = pd.DataFrame(columns=submission.columns)

In [29]:
submission['label'] = pred_test
submission['ID']=df_test['ID']

In [30]:
submission.head()

Unnamed: 0,ID,label
0,uid_764784,0
1,uid_67552,1
2,uid_240490,1
3,uid_56568,0
4,uid_875860,0


In [31]:
submission.shape

(8000, 2)

In [None]:
pd.DataFrame(submission).to_csv('my_sub_1.csv', index=False)