In [None]:
import os

import pandas as pd
import numpy as np

from time import time

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report


import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
%matplotlib inline

In [None]:
# config
# Input location: DATA_DIR is joined with the file names below via
# os.path.join, so the empty string leaves them as bare relative paths.
DATA_DIR = ''
TRAIN_DATA_FILE, TEST_DATA_FILE = 'train.csv', 'test.csv'

# Presumably the name of the predictions file to write; it is not used in
# the cells visible here.
SUBMISSION_FILE = 'submission_lr.csv'

# Seed handed to every random_state argument (splits, SVD, classifiers).
RANDOM_STATE = 0

In [None]:
# Shell escape: list the current working directory (e.g. to see which data files are present).
!ls

sample_data  train.csv


In [None]:
# Read the training CSV found at DATA_DIR/TRAIN_DATA_FILE into a DataFrame.
train_data = pd.read_csv(
    filepath_or_buffer=os.path.join(DATA_DIR, TRAIN_DATA_FILE)
)
# Loading of the test set is currently disabled:
# test_data = pd.read_csv(os.path.join(DATA_DIR, TEST_DATA_FILE))

In [None]:
# Discard every row that has a missing value in any column.
train_data = train_data.dropna()

# strip extra spaces: split on whitespace and re-join with single spaces,
# which also trims leading/trailing whitespace. `assign` builds a new frame
# instead of writing into the result of dropna().
train_data = train_data.assign(**{
    column: train_data[column].map(lambda raw: ' '.join(raw.split()))
    for column in ('text', 'selected_text')
})

# Same normalisation for the test set (disabled while test_data is not loaded):
# test_data['text'] = test_data['text'].map(lambda text: ' '.join(text.split()))

In [None]:
# create 2 target columns for 2 models
# 'starts': character offset of the first occurrence of selected_text inside
#           text (str.find, so -1 would mean "not found").
# 'ends':   that offset plus len(selected_text), i.e. an exclusive end offset.
starts = [
    text.find(selected_text)
    for text, selected_text in zip(train_data['text'], train_data['selected_text'])
]
ends = [
    start + len(selected_text)
    for start, selected_text in zip(starts, train_data['selected_text'])
]

train_data = train_data.assign(starts=starts, ends=ends)

train_data.head(3)

Unnamed: 0,textID,text,selected_text,sentiment,starts,ends
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,0,35
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,0,8
2,088c60f138,my boss is bullying me...,bullying me,negative,11,22


In [None]:
# Model input: the tweet text followed by one space and its sentiment label,
# so the sentiment word becomes one more token of the document.
train_data['combined'] = train_data['text'].str.cat(train_data['sentiment'], sep=' ')
train_data.head(3)

Unnamed: 0,textID,text,selected_text,sentiment,starts,ends,combined
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,0,35,"I`d have responded, if I were going neutral"
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,0,8,Sooo SAD I will miss you here in San Diego!!! ...
2,088c60f138,my boss is bullying me...,bullying me,negative,11,22,my boss is bullying me... negative


In [None]:
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD

# # Transformers
# X_train, X_test, y_train, y_test = train_test_split(train_data['combined'],
#                                                     train_data['starts'],
#                                                     test_size=0.1,
#                                                     random_state=RANDOM_STATE)



# transform_pipe = Pipeline([('vect', CountVectorizer(tokenizer=lambda x: x.split())),
#                            ('reduce', TruncatedSVD(n_components=100))
#                          ]
#                          )

# transform_pipe.fit(X_train, y_train)

In [None]:
 # Model 1 - predicting starts
# Hold out 10% of the rows; the target is the start offset of the selection.
X_train, X_test, y_train, y_test = train_test_split(
    train_data['combined'],
    train_data['starts'],
    test_size=0.1,
    random_state=RANDOM_STATE,
)

# Whitespace-token counts -> 2 SVD components -> logistic regression that
# treats each distinct start offset as a class label.
starts_pipe = Pipeline(steps=[
    # str.split(doc) is the same call as doc.split(): tokenise on whitespace.
    ('vect', CountVectorizer(tokenizer=str.split)),
    ('reduce', TruncatedSVD(n_components=2, random_state=RANDOM_STATE)),
    ('clf', LogisticRegression(C=0.001, max_iter=1000, random_state=RANDOM_STATE)),
])

In [None]:
# Fit Model 1 and report the wall-clock duration.
# X_train / y_train must still be the 'starts' split here: the Model 2 cell
# rebinds both names.
t0 = time()
starts_pipe.fit(X_train, y_train)
elapsed = time() - t0
print(f'Done in {elapsed} seconds')
# Timings noted from earlier runs:
# Done in 328.1036448478699 seconds
# Done in 97.93197679519653 seconds

Done in 37.172874450683594 seconds


In [None]:
# # Grid Search for Model 1
# params = {
#     'reduce__n_components': [2, 30, 50, 100, 500],
#     'clf__C': [0.001, 0.009, 0.01, 0.09, 1, 5, 10, 25],
#     'clf__class_weight': [None, 'balanced']
# }

# gs_starts_pipe = GridSearchCV(starts_pipe,
#                       params,
#                       cv=5,
#                     verbose=2,
#                       n_jobs=-1
#                      )

# t0 = time()
# gs_starts_pipe.fit(X_train, y_train)
# print(f'Done in {time() - t0} seconds')

# print(gs_starts_pipe.best_params_)
# # {'clf__C': 0.001, 'clf__class_weight': None, 'reduce__n_components': 2}

In [None]:
# Model 2 - predicting ends
# Same inputs, test_size and random_state as the Model 1 split; only the
# target column differs (exclusive end offset of the selection).
X_train, X_test, y_train, y_test = train_test_split(
    train_data['combined'],
    train_data['ends'],
    test_size=0.1,
    random_state=RANDOM_STATE,
)

# Whitespace-token counts -> 1000 SVD components -> logistic regression that
# treats each distinct end offset as a class label.
ends_pipe = Pipeline(steps=[
    # str.split(doc) is the same call as doc.split(): tokenise on whitespace.
    ('vect', CountVectorizer(tokenizer=str.split)),
    ('reduce', TruncatedSVD(n_components=1000, random_state=RANDOM_STATE)),
    # C=0.09 is commented out, so LogisticRegression runs with its default C.
    ('clf', LogisticRegression(
        # C=0.09,
        max_iter=1000,
        random_state=RANDOM_STATE,
    )),
])

In [None]:
# Train Model 2
# Fits on the X_train / y_train bound by the Model 2 split and reports the
# wall-clock duration.
t0 = time()
ends_pipe.fit(X_train, y_train)
elapsed = time() - t0
print(f'Done in {elapsed} seconds')
# Timings noted from earlier runs:
# Done in 282.0183570384979 seconds
# Done in 26.879960775375366 seconds
# in colab
# Done in 54.29371666908264 seconds

Done in 199.16251587867737 seconds


In [None]:
# # Grid Search for Model 2

# params = {
#     'reduce__n_components': [2, 30, 50, 100, 500],
#     'clf__C': [0.001, 0.009, 0.01, 0.09, 1, 5, 10, 25],
#     'clf__class_weight': [None, 'balanced']
# }

# gs_ends_pipe = GridSearchCV(ends_pipe,
#                       params,
#                       cv=5,
#                       verbose=1,
#                       n_jobs=-1
#                      )

# t0 = time()
# gs_ends_pipe.fit(X_train, y_train)
# print(f'Done in {time() - t0} seconds')

# print(gs_ends_pipe.best_params_)
# # Done in 14950.651699781418 seconds
# # {'clf__C': 0.09, 'clf__class_weight': None, 'reduce__n_components': 500}

In [None]:
def slice_text(text_start_end):
    """Return ``text[start:end]`` for a ``(text, start, end)`` triple.

    The argument is unpacked by iteration, so a tuple, a list or a
    three-column DataFrame row all work. Normal slice semantics apply:
    an end at or before the start yields an empty string, and offsets
    past the end of the text are clipped.
    """
    text, begin, stop = text_start_end
    span = slice(begin, stop)
    return text[span]

def jaccard(top_selected):
    """Word-level Jaccard similarity of a pair of strings.

    Parameters
    ----------
    top_selected : iterable of two str
        Unpacked by iteration as ``(str1, str2)``; a tuple, a list or a
        two-column DataFrame row all work.

    Returns
    -------
    float
        ``len(A & B) / len(A | B)`` where A and B are the sets of
        lower-cased, whitespace-separated tokens of the two strings.
        Returns 1.0 when neither string contains a token.
    """
    str1, str2 = top_selected
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    union_size = len(a | b)
    if union_size == 0:
        # Both strings are empty or whitespace-only. The plain formula would
        # divide by zero here; two empty token sets are treated as identical.
        return 1.0
    return len(a & b) / union_size

In [None]:

# pred_starts = starts_pipe.predict(X_test)
# pred_ends = ends_pipe.predict(X_test)

In [None]:
# Predict and evaluate
# Split again with the same test_size and random_state as the two model
# cells, this time keeping text and sentiment as separate columns and using
# the true selected_text as the target.
X_train, X_test, y_train, y_test = train_test_split(
    train_data[['text', 'sentiment']],
    train_data['selected_text'],
    test_size=0.1,
    random_state=RANDOM_STATE,
)

# Rebuild the model input exactly as the 'combined' training column was built.
combined = X_test['text'] + ' ' + X_test['sentiment']

res = (
    X_test[['text', 'sentiment']]
    .assign(
        pred_starts=starts_pipe.predict(combined),  # predict starts
        pred_ends=ends_pipe.predict(combined),      # predict ends
    )
    # Cut the predicted span out of the text; when pred_ends <= pred_starts
    # the slice is empty and the row scores 0.
    .assign(
        pred_select=lambda frame: frame[['text', 'pred_starts', 'pred_ends']].apply(slice_text, axis=1)
    )
    .assign(true_select=y_test)  # aligned on the row index
    .assign(
        score=lambda frame: frame[['pred_select', 'true_select']].apply(jaccard, axis=1)
    )
)

# Mean word-level Jaccard per sentiment, then over the whole held-out set.
print(res.groupby('sentiment')['score'].mean())
print('\nTotal score:', res['score'].mean())

res.head(2)

# CountVectorizer -> LogisticRegression
# sentiment
# negative    0.200697
# neutral     0.718845
# positive    0.197406
# Name: score, dtype: float64

# Total score: 0.41764852064086405

# CntVec -> SVD
# sentiment
# negative    0.191827
# neutral     0.771205
# positive    0.172073
# Name: score, dtype: float64

# Total score: 0.42944328839420626

# gs_starts_pipe.best_params_
# # {'clf__C': 0.001, 'clf__class_weight': None, 'reduce__n_components': 2}
# gs_ends_pipe.best_params_
# # {'clf__C': 0.09, 'clf__class_weight': None, 'reduce__n_components': 500}
# sentiment
# negative    0.193586
# neutral     0.724848
# positive    0.172904
# Name: score, dtype: float64

# Total score: 0.4106744554080874

sentiment
negative    0.216373
neutral     0.775776
positive    0.208481
Name: score, dtype: float64

Total score: 0.44925569354279665


Unnamed: 0,text,sentiment,pred_starts,pred_ends,pred_select,true_select,score
20673,loves the nice weather and 7:30 exams,positive,0,8,loves th,loves,0.5
12581,"Okay, this weather isn`t 'cute sundress' frien...",neutral,0,94,"Okay, this weather isn`t 'cute sundress' frien...","Okay, this weather isn`t 'cute sundress' frien...",0.68
