In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, StratifiedKFold

import xgboost as xgb
from xgboost import XGBClassifier

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk import FreqDist

In [114]:
# Fetch the 'averaged_perceptron_tagger' resource for nltk.pos_tag, which
# lemmatize_sentence calls.
# NOTE(review): lemmatize_sentence also calls nltk.word_tokenize and
# WordNetLemmatizer, which presumably need their own NLTK resources
# (tokenizer models and the WordNet corpus) — confirm they are installed
# before running this notebook on a fresh environment.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\33679\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


True

In [14]:
# No header row is read from the file; the six column names are supplied here.
df = pd.read_csv(
    'training.1600000.processed.noemoticon.csv',
    names=["target", "ids", "date", "flag", "user", "text"],
    encoding="ISO-8859-1",
)

In [15]:
# Keep only the label and the raw tweet text; drop the four metadata columns.
df = df.drop(columns=['ids', 'date', 'flag', 'user'])

In [16]:
# Build the cleaned text column from the raw tweets in one idempotent chain.
# regex=True is passed explicitly: since pandas 2.0 `Series.str.replace`
# treats the pattern as a literal string by default, which would silently
# leave mentions, URLs and punctuation in place.
df['text_processed'] = (
    df['text']
    .str.replace(r'@\S+', '', regex=True)        # strip @mentions
    .str.replace(r'http\S+', '', regex=True)     # strip URLs
    .str.replace('[^a-zA-Z]', ' ', regex=True)   # non-letters become spaces
    .str.lower()
    # Drop one-character tokens (e.g. the "s" left over from "that's")
    # and collapse runs of whitespace.
    .apply(lambda x: ' '.join(w for w in x.split() if len(w) > 1))
)

In [None]:
# Single lemmatizer instance shared by every lemmatize_sentence call.
lemmatizer = WordNetLemmatizer()

# Translate an NLTK POS tag into the matching WordNet POS constant
def nltk_tag_to_wordnet_tag(nltk_tag):
    """Return the WordNet POS constant for an NLTK POS tag, or None.

    Only the first character of the tag is inspected:
    'J' -> wordnet.ADJ, 'V' -> wordnet.VERB, 'N' -> wordnet.NOUN,
    'R' -> wordnet.ADV. Any other tag (including an empty one) gives None.
    """
    # First tag letter -> name of the constant on the `wordnet` corpus reader.
    constant_by_initial = {'J': 'ADJ', 'V': 'VERB', 'N': 'NOUN', 'R': 'ADV'}
    constant_name = constant_by_initial.get(nltk_tag[:1])
    if constant_name is None:
        return None
    return getattr(wordnet, constant_name)

# Lemmatize each token of a sentence, guided by its POS tag
def lemmatize_sentence(sentence):
    """Return `sentence` with every token lemmatized according to its POS tag.

    The sentence is tokenized and POS-tagged with NLTK. Tokens whose tag maps
    to a WordNet POS are lemmatized with the shared `lemmatizer`; tokens
    without a mapping are kept unchanged. The resulting tokens are joined
    with single spaces.
    """
    tokens = nltk.word_tokenize(sentence)
    lemmas = []
    for token, tag in nltk.pos_tag(tokens):
        wordnet_pos = nltk_tag_to_wordnet_tag(tag)
        # Without a WordNet POS there is nothing to guide the lemmatizer,
        # so the token passes through as is.
        lemmas.append(
            token if wordnet_pos is None
            else lemmatizer.lemmatize(token, wordnet_pos)
        )
    return " ".join(lemmas)


# Lemmatize every cleaned tweet; each row is tokenized and POS-tagged in turn.
df['text_processed'] = df['text_processed'].apply(lemmatize_sentence)

In [17]:
# Inspect the frame; the recorded output shows only the first and last rows.
df

Unnamed: 0,target,text,text_processed
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",awww that bummer you shoulda got david carr of...
1,0,is upset that he can't update his Facebook by ...,is upset that he can update his facebook by te...
2,0,@Kenichan I dived many times for the ball. Man...,dived many times for the ball managed to save ...
3,0,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all....",no it not behaving at all mad why am here beca...
...,...,...,...
1599995,4,Just woke up. Having no school is the best fee...,just woke up having no school is the best feel...
1599996,4,TheWDB.com - Very cool to hear old Walt interv...,thewdb com very cool to hear old walt interviews
1599997,4,Are you ready for your MoJo Makeover? Ask me f...,are you ready for your mojo makeover ask me fo...
1599998,4,Happy 38th Birthday to my boo of alll time!!! ...,happy th birthday to my boo of alll time tupac...


In [18]:
# Features are the cleaned tweets; labels are the `target` column.
X, y = df['text_processed'], df['target']

In [7]:
# Search space for the randomized hyper-parameter search over `model`.
# The keys must be constructor arguments of the estimator being tuned, which
# is a RandomForestClassifier. The previous grid (min_child_weight, gamma,
# subsample, colsample_bytree) held XGBoost-only parameters, which the search
# would reject as invalid for a random forest.
params = {
        'n_estimators': [50, 100, 200],
        'max_depth': [10, 20, None],       # None lets the trees grow fully
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 5, 10],
        'max_features': ['sqrt', 'log2']
        }

In [19]:
# Hold out 33% of the tweets for evaluation; the seed is fixed at 0.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=0,
)

In [20]:
# Count-based features: the vectorizer is fitted on the training split only
# and the fitted vectorizer is then applied to the test split.
vec = CountVectorizer()
X_train_transformed = vec.fit_transform(X_train)
X_test_transformed = vec.transform(X_test)

In [25]:
# Fixed seed so the forest's random sampling, and therefore the reported
# scores, are repeatable across kernel restarts.
model = RandomForestClassifier(random_state=0)

In [None]:
folds = 3        # number of stratified CV folds
param_comb = 5   # number of parameter combinations sampled from `params`

skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=1001)

# Pass the splitter itself as `cv`. The previous `cv=skf.split(X, y)` built
# fold indices over the *full* dataset, while `fit` below only receives the
# training split, so those indices would point past the end of the training
# matrix. With `cv=skf` the folds are derived from the data given to `fit`.
random_search = RandomizedSearchCV(
    model,
    param_distributions=params,
    n_iter=param_comb,
    scoring='roc_auc',
    n_jobs=4,
    cv=skf,
    verbose=3,
    random_state=1001,
)

random_search.fit(X_train_transformed, y_train)

In [None]:
# Show the best estimator found by the randomized search.
print(random_search.best_estimator_)

In [None]:
# Fit `model` itself on the count features.
# NOTE(review): this is the estimator as originally constructed, not
# random_search.best_estimator_ — confirm the tuned model is meant to be
# ignored here.
model.fit(X_train_transformed, y_train)

In [None]:
# Predict labels for the held-out tweets from their count features.
y_pred = model.predict(X_test_transformed)

In [None]:
# Accuracy of `model` on the held-out split.
accuracy_score(y_test, y_pred)

In [57]:
# Sanity check: what does the model predict for an empty tweet?
# The recorded output below is class 4.
model.predict(vec.transform(pd.Series('')))

array([4], dtype=int64)

In [54]:
# TF-IDF features over unigrams and bigrams (ngram_range=(1, 2)).
vector = TfidfVectorizer(ngram_range=(1, 2))

In [56]:
# Fit the TF-IDF vectorizer on the training split only, then apply the fitted
# vectorizer to the test split.
X_training = vector.fit_transform(X_train)
X_testing = vector.transform(X_test)

In [57]:
# NOTE: this rebinds `model`, which earlier cells bound to the
# RandomForestClassifier; after this cell `model` is the linear SVM trained
# on the TF-IDF features.
model = LinearSVC()
model.fit(X_training, y_train)

LinearSVC()

In [58]:
# Score the linear SVM on the held-out TF-IDF features.
y_prediction = model.predict(X_testing)
accuracy = accuracy_score(y_test, y_prediction)

In [59]:
# Display the held-out accuracy computed in the previous cell.
accuracy

0.8215321969696969