### Loading the data

In [None]:
import pandas as pd

# Load data
# index_col=0 uses the first CSV column as the DataFrame index.
full_data = pd.read_csv('input/full_data_final.csv', index_col=0)

# Leave the preview as the cell's last expression so Jupyter renders it as a
# rich table instead of the plain-text output produced by print().
full_data.head()

### Importing libraries and splitting the full data into train/validation sets

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import seaborn as sns
from matplotlib import pyplot as plt

# Data valid/train split
# Hold out 10% of the rows for validation; the fixed seed keeps the split
# reproducible between runs.
train_texts, valid_text, y_train, y_valid = train_test_split(
    full_data['content'],
    full_data['sarcastic'],
    test_size=0.1,
    random_state=42,
)

### Comparing the distribution of log comment length for sarcastic vs. non-sarcastic comments

In [None]:
import numpy as np

# Data analysis
# Overlay one histogram of log(1 + comment length) per class. The sarcastic
# class is drawn first so the colour assignment stays the same.
for class_flag, class_name in ((1, 'sarcastic'), (0, 'normal')):
    class_mask = full_data['sarcastic'] == class_flag
    comment_lengths = full_data.loc[class_mask, 'content'].str.len()
    comment_lengths.apply(np.log1p).hist(label=class_name, alpha=.5)
plt.legend();

### Displaying word clouds for sarcastic and non-sarcastic comments

In [None]:
from wordcloud import WordCloud, STOPWORDS

# Word cloud
wordcloud = WordCloud(background_color='white', stopwords=STOPWORDS,
                      max_words=200, max_font_size=100,
                      random_state=17, width=800, height=400)

# One figure per class, sarcastic first.
for class_flag, class_title in ((1, 'Sarcastic comments'), (0, 'Non-sarcastic comments')):
    class_comments = full_data.loc[full_data['sarcastic'] == class_flag, 'content']

    # str(Series) returns pandas' truncated repr (index values, "...", and the
    # trailing "Name: content, dtype: object"), so the cloud would be built
    # from a handful of rows plus repr noise. Join the real comment texts
    # instead; missing values are dropped so they do not show up as "nan".
    # NOTE: this feeds every comment to the generator, which is slower than
    # before on large datasets.
    class_text = ' '.join(class_comments.dropna().astype(str))

    plt.figure(figsize=(16, 12))
    wordcloud.generate(class_text)
    plt.imshow(wordcloud)
    plt.title(class_title)

### Creating the pipeline (transformer + model)

In [None]:
# Show how many training comments the pipeline will be fitted on
print(len(train_texts))

# Model pipeline creation
# Word-level TF-IDF over uni-, bi- and trigrams. Terms present in more than
# 80% of the documents are ignored and the vocabulary is capped at 500,000
# features.
tf_idf = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 3),
    max_df=0.8,
    max_features=500000,
)

# L2-regularised logistic regression optimised with L-BFGS
logit = LogisticRegression(
    penalty='l2',
    C=1,
    solver='lbfgs',
    random_state=0,
    n_jobs=-1,
    verbose=1,
)

# Chain the vectoriser and the classifier into a single estimator
tfidf_logit_pipeline = Pipeline(steps=[('tf_idf', tf_idf), ('logit', logit)])

### Fitting the pipeline to the training data

In [None]:
# Model training
# Fits both pipeline steps (vectoriser, then classifier) on the training split
tfidf_logit_pipeline.fit(X=train_texts, y=y_train)

### Evaluating the model on the validation set

In [None]:
# Evaluation
# Predict labels for the held-out comments
valid_pred = tfidf_logit_pipeline.predict(valid_text)

# Overall accuracy first, then the per-class report
validation_accuracy = accuracy_score(y_valid, valid_pred)
print(validation_accuracy)

validation_report = classification_report(y_valid, valid_pred)
print(validation_report)

### Define a function to plot a pretty confusion matrix

In [None]:
# Conf matrix function
def plot_confusion_matrix(actual, predicted, classes,
                          normalize=False,
                          title='Confusion matrix', figsize=(7,7),
                          cmap=plt.cm.Blues, path_to_save_fig=None):
    """
    Plot the confusion matrix of `predicted` against `actual` as a heatmap.

    The matrix is transposed before plotting, so rows (y axis) are predicted
    labels and columns (x axis) are true labels.

    Parameters
    ----------
    actual : array-like
        Ground-truth labels.
    predicted : array-like
        Labels predicted by the model.
    classes : sequence
        Tick labels for both axes. They are only used for display and are
        assumed to be in the same order as the rows/columns of the computed
        matrix -- verify against the caller.
    normalize : bool, default False
        If True, every row of the plotted matrix (one row per predicted label)
        is divided by its row total. Rows whose total is zero are left as
        zeros instead of turning into NaN.
    title : str
        Figure title.
    figsize : tuple
        Figure size passed to `plt.figure`.
    cmap : matplotlib colormap
        Colormap used for the heatmap.
    path_to_save_fig : str or None
        If given, the figure is also saved to this path at 300 dpi.
    """
    import itertools
    from sklearn.metrics import confusion_matrix

    # Transposed: rows = predicted labels, columns = true labels
    cm = confusion_matrix(actual, predicted).T
    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # A label that is never predicted has a zero row total; dividing by it
        # would fill the row with NaN and make the colour threshold NaN too.
        cm = np.divide(cm, row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)

    plt.figure(figsize=figsize)
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=90)
    plt.yticks(tick_marks, classes)

    # Write each cell value on top of the heatmap, switching to white text on
    # dark cells so the numbers stay readable.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('Predicted label')
    plt.xlabel('True label')
    # Run after the axis labels are set so the layout makes room for them
    plt.tight_layout()

    if path_to_save_fig:
        plt.savefig(path_to_save_fig, dpi=300, bbox_inches='tight')

### Plotting the confusion matrix

In [None]:
# Plot confusion matrix
# Axis tick labels come from the classes stored on the fitted classifier step
plot_confusion_matrix(
    actual=y_valid,
    predicted=valid_pred,
    classes=tfidf_logit_pipeline.named_steps['logit'].classes_,
    figsize=(8, 8),
)

### Using ELI5 library to outline important phrases for sarcasm

In [None]:
import eli5

# Sarcasm phrases detection
# Pass the classifier together with the vectoriser from the same pipeline
pipeline_steps = tfidf_logit_pipeline.named_steps
eli5.show_weights(estimator=pipeline_steps['logit'],
                  vec=pipeline_steps['tf_idf'])