In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, StratifiedKFold

import xgboost as xgb
from xgboost import XGBClassifier

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk import FreqDist

In [114]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\33679\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


True

In [20]:
df = pd.read_csv('training.1600000.processed.noemoticon.csv', encoding="ISO-8859-1", names=["target", "ids", "date", "flag", "user", "text"])

In [None]:
df = df.drop(['ids', 'date', 'flag', 'user'], axis = 'columns')

In [23]:
df['text_processed'] = df['text'].str.replace(r'@\S+', '')
df['text_processed'] = df['text_processed'].str.replace(r'http\S+','')
df['text_processed'] = df['text_processed'].str.replace('[^a-zA-Z]',' ')
df['text_processed'] = df['text_processed'].str.lower()
df['text_processed'] = df['text_processed'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>2]))

In [24]:
lemmatizer = WordNetLemmatizer()

# Finds the part of speech tag
def nltk_tag_to_wordnet_tag(nltk_tag):
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:
        return None

# lemmatize sentence using pos tag
def lemmatize_sentence(sentence):
    #tokenize the sentence and find the POS tag for each token
    nltk_tagged = nltk.pos_tag(nltk.word_tokenize(sentence))
    #tuple of (token, wordnet_tag)
    wordnet_tagged = map(lambda x: (x[0], nltk_tag_to_wordnet_tag(x[1])), nltk_tagged)
    lemmatized_sentence = []
    for word, tag in wordnet_tagged:
        if tag is None:
            #if there is no available tag, append the token as is
            lemmatized_sentence.append(word)
        else:
            #else use the tag to lemmatize the token
            lemmatized_sentence.append(lemmatizer.lemmatize(word, tag))
    return " ".join(lemmatized_sentence)


df['text_processed'] = df['text_processed'].apply(lambda x: lemmatize_sentence(x))

KeyboardInterrupt: 

In [None]:
df

In [6]:
X = df['text']
y = df['target']

In [7]:
params = {
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5]
        }

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

In [9]:
vec = CountVectorizer()
X_train_transformed = vec.fit_transform(X_train)
X_test_transformed = vec.transform(X_test)

In [13]:
model = XGBClassifier(learning_rate=0.02, n_estimators=600, objective='binary:logistic',silent=True, nthread=1)

In [17]:
folds = 3
param_comb = 5

skf = StratifiedKFold(n_splits=folds, shuffle = True, random_state = 1001)
random_search = RandomizedSearchCV(model, param_distributions=params, n_iter=param_comb, scoring='roc_auc', n_jobs=4, cv=skf.split(X,y), verbose=3, random_state=1001)

random_search.fit(X_train_transformed, y_train)

Fitting 3 folds for each of 5 candidates, totalling 15 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


IndexError: index (1599999) out of range

In [15]:
print(random_search.best_estimator_)

AttributeError: 'RandomizedSearchCV' object has no attribute 'best_estimator_'

In [None]:
model.fit(X_train_transformed, y_train)

In [85]:
y_pred = model.predict(X_test_transformed)

In [86]:
accuracy_score(y_test, y_pred)

0.7591193181818182

In [57]:
model.predict(vec.transform(pd.Series('')))

array([4], dtype=int64)

In [54]:
vector = TfidfVectorizer(ngram_range=(1, 2))

In [56]:
X_training = vector.fit_transform(X_train)
X_testing = vector.transform(X_test)

In [57]:
model = LinearSVC()
model.fit(X_training, y_train)

LinearSVC()

In [58]:
y_prediction = model.predict(X_testing)
accuracy = accuracy_score(y_test, y_prediction)

In [59]:
accuracy

0.8215321969696969