In [3]:
import os
import pandas as pd
import numpy as np
import sklearn
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn import metrics
import string
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.stem import WordNetLemmatizer

In [4]:
# Seed NumPy's legacy global RNG once, up front, so that any later step which
# draws from that global state gives the same result on a fresh top-to-bottom run.
SEED = 42
np.random.seed(SEED)

In [8]:
# Read the training data and the data to predict on from the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [9]:
# Drop exact repeats: rows sharing both text and target add no information.
train = train.drop_duplicates(subset=['text', 'target'])
# Any text that still occurs more than once must carry conflicting targets
# (same message, different label), so every copy of it is removed.
# duplicated(keep=False) flags all members of a repeated group in one vectorized
# pass and, unlike pd.concat over a filtered groupby, does not raise ValueError
# when there are no conflicting messages at all.
rep = train[train.duplicated(subset='text', keep=False)]
lst = rep['id'].tolist()
# .copy() hands later cells an independent frame, so their column assignments
# do not trigger pandas' SettingWithCopyWarning on a filtered slice.
train = train[~train.id.isin(lst)].copy()

In [11]:
# Normalise the case of the text column in both datasets.
for frame in (train, test):
    frame['text'] = frame['text'].str.lower()

In [12]:
# Do lemmatization
lemmatizer = WordNetLemmatizer()


def _lemmatize_verbs(text):
    """Tokenize ``text`` and lemmatize every token as a verb.

    Returns ``str()`` of the resulting token list (a string that looks like
    "['be', 'run']"). That is the exact format the notebook has always stored
    in the column; the punctuation-stripping step later in the notebook removes
    the brackets, quotes and commas, so the format is kept here to leave the
    downstream features unchanged.
    """
    return str([lemmatizer.lemmatize(w, pos='v') for w in nltk.word_tokenize(text)])


# One Series.apply per frame replaces the two copy-pasted row-by-row .loc loops.
train['text'] = train['text'].apply(_lemmatize_verbs)
test['text'] = test['text'].apply(_lemmatize_verbs)

In [13]:
# Remove StopWords
words = ['the', 'a', 'an', 'or', 'and']
# Whole-word alternation over the list above.
pat = r'\b(?:' + '|'.join(words) + r')\b'


def _drop_stopwords_and_number_commas(texts):
    """Delete every match of ``pat`` from each string, then fuse two digit runs
    separated by a single comma (e.g. "10,000" -> "10000")."""
    stripped = texts.str.replace(pat, '', regex=True)
    return stripped.apply(lambda s: re.sub(r'(\d+),(\d+)', r'\1\2', s))


train['text'] = _drop_stopwords_and_number_commas(train['text'])
test['text'] = _drop_stopwords_and_number_commas(test['text'])

'\\b(?:the|a|an|or|and)\\b'

In [14]:
# Remove punctuation except exclamation and question marks
p_to_exclude = ['?', '!']
c = set(string.punctuation) - set(p_to_exclude)
# Build the character class once; re.escape keeps characters such as ']' and '\'
# literal inside the brackets, so the set's iteration order does not matter.
punct_class = r'[{}]'.format(re.escape(''.join(c)))
for frame in (train, test):
    frame['text'] = frame['text'].replace(punct_class, '', regex=True)

In [15]:
# Classes are slightly imbalanced
# Count rows per target once and reuse the result (it was recomputed three times).
class_counts = train.groupby('target').id.count()
print(class_counts)
# Share of each class as a percentage of all rows with an id.
class_share = class_counts / train.id.count() * 100
# Round both shares with the same ndigits so the two printed values always have
# the same type and format (only one of the two calls passed ndigits before).
print(round(class_share[0], 0), ':', round(class_share[1], 0))

target
0    4297
1    3188
Name: id, dtype: int64
57.0 : 43.0


In [16]:
# Do vectorization
# Learn the TF-IDF vocabulary and weights from the training text only, then
# reuse that fitted vectorizer to transform the test text.
# NOTE(review): the vectorizer is fitted on all of `train` before the hold-out
# split made further down, so the IDF weights also see the validation rows.
vectorizer = TfidfVectorizer()
train_vectors = vectorizer.fit_transform(train['text'])
test_vectors = vectorizer.transform(test['text'])

In [17]:
# Hold out 30% of the vectorized training rows for validation.
# random_state pins the shuffle so the split no longer depends on the state of
# NumPy's global RNG, i.e. on which cells ran (or re-ran) before this one.
# 42 is the same value passed to np.random.seed earlier in the notebook.
X_train, X_test, y_train, y_test = train_test_split(
    train_vectors, train.target, test_size=0.3, random_state=42
)

In [None]:
### SVM ###

In [18]:
# Support-vector classifier with a linear kernel; every other SVC setting is left at its default.
clf = svm.SVC(kernel = 'linear') 

In [19]:
# 5-fold cross-validated F1 of the (not yet fitted) classifier on the training split.
scores = model_selection.cross_val_score(
    estimator=clf,
    X=X_train,
    y=y_train,
    scoring='f1',
    cv=5,
)
scores

array([0.77184466, 0.74189676, 0.72952854, 0.72243346, 0.72590738])

In [20]:
# Fit on the training split, then predict the held-out rows in one chained call
# (fit returns the estimator itself).
y_pred = clf.fit(X_train, y_train).predict(X_test)

SVC(kernel='linear')

In [22]:
# Report each hold-out metric for the predictions on the validation split, one per line.
for label, score_fn in (
    ("Accuracy:", metrics.accuracy_score),
    ("Precision:", metrics.precision_score),
    ("Recall:", metrics.recall_score),
    ("F1:", metrics.f1_score),
):
    print(label, score_fn(y_test, y_pred))

Accuracy: 0.8076580587711487
Precision: 0.837730870712401
Recall: 0.6726694915254238
F1: 0.7461809635722679


In [26]:
# Load the submission template, overwrite its target column with the model's
# predictions for the test vectors, and save it without the index column.
# NOTE(review): assumes the template's rows are in the same order as test.csv — confirm.
sample_submission = pd.read_csv('sample_submission.csv').assign(
    target=clf.predict(test_vectors)
)
sample_submission.to_csv("submission191.csv", index=False)

In [None]:
# FINAL SCORE 0.79650