In [None]:
# List the contents of the input directory that holds the training CSV loaded below.
!ls ../input/sarcasm/

In [1]:
import os
import gc
import numpy as np
import pandas as pd
from scipy import sparse
from collections import defaultdict
from wordcloud import WordCloud, STOPWORDS


from plotly import tools
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
import eli5
import lightgbm as lgb

# Notebook-wide pandas settings: turn off the chained-assignment check and
# allow up to 999 columns when a frame is displayed.
pd.set_option('mode.chained_assignment', None)
pd.set_option('display.max_columns', 999)

In [2]:
# Location of the balanced sarcasm training set, relative to the notebook.
TRAIN_CSV_PATH = '../input/sarcasm/train-balanced-sarcasm.csv'
train_df = pd.read_csv(TRAIN_CSV_PATH)

In [3]:
# Per-column count of missing values in the training frame, shown as a
# one-column table labelled 'train'.
k = train_df.isnull().sum().to_frame(name='train')
k

In [4]:
# Drop rows with a missing comment BEFORE filling: once NaN has been replaced
# by ' ', a later dropna(subset=['comment']) has nothing left to remove and
# blank comments would end up in the train/test split.
train_df.dropna(subset=['comment'], inplace=True)
# Replace the remaining NaN values (other columns) with a single space.
train_df.fillna(' ', inplace=True)
gc.collect()

In [5]:
# Discard, in place, every row whose 'comment' value is NaN. This only removes
# rows if those NaNs have not already been replaced earlier in the notebook.
train_df.dropna(subset=['comment'], inplace=True)

In [6]:
# Hold out part of the data for evaluation: comments are the inputs, 'label'
# is the target. random_state is pinned so the split is repeatable.
(train_texts, test_texts,
 y_train, y_test) = train_test_split(
    train_df['comment'],
    train_df['label'],
    random_state=17,
)

In [7]:
## Target count: bar chart with one bar per label value ##
cnt_srs = train_df['label'].value_counts()

trace = go.Bar(
    x=cnt_srs.index,
    y=cnt_srs.values,
    # Bars are coloured by their own height on a reversed 'Picnic' scale.
    marker={
        'color': cnt_srs.values,
        'colorscale': 'Picnic',
        'reversescale': True,
    },
)
layout = go.Layout(title='Target Count', font={'size': 18})
data = [trace]
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename="TargetCount")

## Target distribution: the same counts expressed as percentages in a pie chart ##
labels = np.array(cnt_srs.index)
sizes = np.array(cnt_srs / cnt_srs.sum() * 100)

trace = go.Pie(labels=labels, values=sizes)
layout = go.Layout(
    title='Target distribution',
    font={'size': 18},
    width=600,
    height=600,
)
data = [trace]
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename="usertype")

In [8]:
# Private copy of wordcloud's stop-word collection.
stopwords = set(STOPWORDS)

# Partition the training frame by target value: label 1 and label 0.
train1_df = train_df.loc[train_df["label"] == 1]
train0_df = train_df.loc[train_df["label"] == 0]

## custom function for ngram generation ##
def generate_ngrams(text, n_gram=1, stop_words=None):
    """Return the word n-grams of ``text`` as space-joined strings.

    The text is lower-cased and split on single space characters. Empty
    tokens (produced by consecutive spaces) and stop words are removed before
    the n-grams are formed, so an n-gram may bridge a position where a stop
    word was dropped.

    Parameters
    ----------
    text : str
        Text to tokenize.
    n_gram : int, default 1
        Number of consecutive tokens per n-gram.
    stop_words : collection of str, optional
        Lower-case tokens to discard. When omitted, the module-level
        ``STOPWORDS`` collection is used, as before.

    Returns
    -------
    list of str
        N-grams in order of appearance; empty when fewer than ``n_gram``
        tokens remain or when ``n_gram`` is 0.
    """
    if stop_words is None:
        stop_words = STOPWORDS
    tokens = [tok for tok in text.lower().split(" ")
              if tok != "" and tok not in stop_words]
    # Zipping the token list against copies shifted by 0..n_gram-1 positions
    # yields every window of n_gram consecutive tokens.
    shifted = [tokens[i:] for i in range(n_gram)]
    return [" ".join(gram) for gram in zip(*shifted)]

## custom function for horizontal bar chart ##
def horizontal_bar_chart(df, color):
    """Build a horizontal plotly bar trace from a word-frequency table.

    ``df`` must provide a "word" column (bar labels) and a "wordcount" column
    (bar lengths). Both columns are reversed before plotting. The trace is
    drawn in ``color`` and is kept out of the legend.
    """
    words_reversed = df["word"].values[::-1]
    counts_reversed = df["wordcount"].values[::-1]
    return go.Bar(
        x=counts_reversed,
        y=words_reversed,
        orientation='h',
        showlegend=False,
        marker={"color": color},
    )

def ngram_frequency_table(comments, n_gram=1):
    """Count the n-grams over ``comments`` and rank them most frequent first.

    Parameters
    ----------
    comments : iterable of str
        Texts to tokenize with ``generate_ngrams``.
    n_gram : int, default 1
        N-gram size passed through to ``generate_ngrams``.

    Returns
    -------
    pandas.DataFrame
        Columns "word" and "wordcount", sorted by descending count. The frame
        is empty (but still has both columns) when no n-gram is found.
    """
    freq_dict = defaultdict(int)
    for sent in comments:
        for word in generate_ngrams(sent, n_gram):
            freq_dict[word] += 1
    # Ascending stable sort followed by a reversal keeps the original
    # ordering among n-grams that share the same count.
    ranked = sorted(freq_dict.items(), key=lambda x: x[1])[::-1]
    # Naming the columns at construction also works when ``ranked`` is empty.
    return pd.DataFrame(ranked, columns=["word", "wordcount"])


## Bar chart of the most frequent words in not sarcastic comments (label == 0) ##
fd_sorted = ngram_frequency_table(train0_df["comment"])
trace0 = horizontal_bar_chart(fd_sorted.head(50), 'blue')

## Bar chart of the most frequent words in sarcastic comments (label == 1) ##
fd_sorted = ngram_frequency_table(train1_df["comment"])
trace1 = horizontal_bar_chart(fd_sorted.head(50), 'blue')

# Creating two subplots
fig = tools.make_subplots(rows=1, cols=2, vertical_spacing=0.04,
                          subplot_titles=["Frequent words of not sarcastic comments", 
                                          "Frequent words of sarcastic comments"])
fig.append_trace(trace0, 1, 1)
fig.append_trace(trace1, 1, 2)
fig['layout'].update(height=1200, width=900, paper_bgcolor='rgb(233,233,233)', title="Word Count Plots")
py.iplot(fig, filename='word-plots')

In [9]:
# Materialize both halves of the split as plain Python lists.
train_texts, y_train = list(train_texts), list(y_train)
test_texts, y_test = list(test_texts), list(y_test)

In [10]:
# TF-IDF features capped at 20000 terms, with English stop words removed.
vectorizer = TfidfVectorizer(max_features=20000, stop_words='english')
# Learn the vocabulary on the training texts only and vectorize them in the
# same pass, instead of tokenizing the training corpus twice (fit, then transform).
embedded_texts_train = vectorizer.fit_transform(train_texts)
# The held-out texts are only transformed with the already-fitted vocabulary.
embedded_texts_test = vectorizer.transform(test_texts)

In [11]:
# Logistic regression (liblinear solver) trained on the TF-IDF training matrix.
# fit() returns the estimator itself, so construction and training are chained.
lr = LogisticRegression(solver='liblinear').fit(embedded_texts_train, y_train)

In [12]:
# Predict labels for the held-out comments and report the fraction predicted correctly.
lr_predictions = lr.predict(embedded_texts_test)
# Pass the ground truth first, then the predictions (y_true, y_pred). Accuracy
# is symmetric, so the value is unchanged, but the order matters as soon as
# another metric is swapped in.
accuracy_score(y_test, lr_predictions)

In [14]:
# Vocabulary terms of the fitted vectorizer. Newer scikit-learn releases expose
# them through get_feature_names_out(); fall back to get_feature_names() when
# that method does not exist. Either way the result is a plain list.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()
# Display names for the two classes, listed in label order (0, then 1).
target_names = ["unsarcastic", "sarcastic"]

In [16]:
# Render the 10 highest-weighted features of the fitted logistic regression,
# labelled with the vectorizer's terms and the readable class names.
eli5.show_weights(
    lr,
    vec=vectorizer,
    top=10,
    feature_names=feature_names,
    target_names=target_names,
)