In [1]:
from sklearn import model_selection, preprocessing, metrics, svm, ensemble
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import pandas as pd, numpy, string
from nltk.tokenize import WordPunctTokenizer
from nltk.stem import PorterStemmer
from bs4 import BeautifulSoup
#Remove Special Charactors
import re

In [2]:
#Import Training and Testing Data
train = pd.read_csv('train_E6oV3lV.csv')
print("Training Set:"% train.columns, train.shape, len(train))
test = pd.read_csv('test_tweets_anuFYb8.csv')
print("Test Set:"% test.columns, test.shape, len(test))

Training Set: (31962, 3) 31962
Test Set: (17197, 2) 17197


In [3]:
#Tokenize words in order to clean and stem
tok = WordPunctTokenizer()
# patterns to remove html tags numbers and special Characters
pat1 = r'@[A-Za-z0-9]+'
pat2 = r'https?://[A-Za-z0-9./]+'
combined_pat = r'|'.join((pat1, pat2))
porter=PorterStemmer()
negations_dic = {"isn't":"is not", "isnt":"is not", "aren't":"are not", "wasn't":"was not","wasnt":"was not", "weren't":"were not",
                "haven't":"have not","havent":"have not","hasn't":"has not","hadn't":"had not","won't":"will not","wont":"will not",
                "wouldn't":"would not", "don't":"do not", "doesn't":"does not","didn't":"did not",
                "can't":"can not","cant":"can not","couldn't":"could not","couldnt":"could not","shouldn't":"should not","shouldn't":"should not","mightn't":"might not",
                "mustn't":"must not"}
neg_pattern = re.compile(r'\b(' + '|'.join(negations_dic.keys()) + r')\b')
def tweet_cleaner(text):
    soup = BeautifulSoup(text, 'lxml')
    souped = soup.get_text()
    stripped = re.sub(combined_pat, '', souped)
    try:
        clean = stripped.decode("utf-8-sig").replace(u"\ufffd", "?")
    except:
        clean = stripped
    letters_only = re.sub("[^a-zA-Z]", " ", clean)
    lower_case = letters_only.lower()
#    neg_handled = neg_pattern.sub(lambda x: negations_dic[x.group()], lower_case)
#    letters_only = re.sub("[^a-zA-Z]", " ", neg_handled)
#    words = [x for x  in tok.tokenize(letters_only) if len(x) > 1]
#    joins  =" ".join(words).strip()
    # During the letters_only process two lines above, it has created unnecessay white spaces,
    # Tokenize and join together to remove unnecessary white spaces
#    print(joins)
    words1 = tok.tokenize(lower_case)
    #Stemming
    stem_sentence=[]
    for word in words1:
        stem_sentence.append(porter.stem(word))
        stem_sentence.append(" ")
    #Rejoin the words back to create the cleaned tweet
    words=" ".join(stem_sentence).strip()  
    spell_corrected = re.sub(r'(.)\1+', r'\1\1', words)
    return spell_corrected

In [4]:
nums = [0,len(train)]
clean_tweet_texts = []
for i in range(nums[0],nums[1]):
    clean_tweet_texts.append(tweet_cleaner(train['tweet'][i]))
nums = [0,len(test)]
test_tweet_texts = []
for i in range(nums[0],nums[1]):
    test_tweet_texts.append(tweet_cleaner(test['tweet'][i]))
#Dataframes to represent the testing and training samples
train_clean = pd.DataFrame(clean_tweet_texts,columns=['tweet'])
train_clean['label'] = train.label
train_clean['id'] = train.id
test_clean = pd.DataFrame(test_tweet_texts,columns=['tweet'])
test_clean['id'] = test.id

In [5]:
test_clean['tidy_tweet'] = test_clean['tweet'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))

In [6]:
train_clean['tidy_tweet'] = train_clean['tweet'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))

In [7]:
train_clean.head()

Unnamed: 0,tweet,label,id,tidy_tweet
0,when a father is dysfunct and is so se...,0,1,when father dysfunct selfish drag into dysfunct
1,thank for lyft credit i can t use caus...,0,2,thank lyft credit caus they offer wheelchair d...
2,bihday your majesti,0,3,bihday your majesti
3,model i love u take with u all the ti...,0,4,model love take with time
4,factsguid societi now motiv,0,5,factsguid societi motiv


In [8]:
del test_clean['tweet']
del train_clean['tweet']
test_clean.columns = ['id','tweet']
train_clean.columns = ['label','id','tweet']

In [10]:
train_clean.head()

Unnamed: 0,label,id,tweet
0,0,1,when father dysfunct selfish drag into dysfunct
1,0,2,thank lyft credit caus they offer wheelchair d...
2,0,3,bihday your majesti
3,0,4,model love take with time
4,0,5,factsguid societi motiv


In [11]:
# split the dataset into training and validation datasets 
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(train_clean['tweet'],train_clean['label'])
# label encode the target variable 
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
valid_y = encoder.fit_transform(valid_y)
valid_y.shape

(7991,)

In [12]:
#Now working with Real challenge Data
train_x=train_clean['tweet']
valid_x=test_clean['tweet']
train_y=train_clean['label']
# label encode the target variable 
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', stop_words='english', max_features=100000,max_df=0.7)
tfidf_vect.fit(train_clean['tweet'])
tfidf_vect.fit(test_clean['tweet'])
xtrain_tfidf =  tfidf_vect.transform(train_x)
xvalid_tfidf =  tfidf_vect.transform(valid_x)

## Using BaggingClassifier on Decision Tree model

In [13]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

In [14]:
from sklearn import model_selection, preprocessing, metrics, svm, ensemble
dt_model = ensemble.ExtraTreesClassifier(n_estimators=200)
bag_dt_model = BaggingClassifier(dt_model, max_features=1.0, n_estimators=15, \
                                 random_state=1, bootstrap=True)

In [15]:
xtrain_tfidf.shape, train_y.shape, xvalid_tfidf.shape

((31962, 19237), (31962,), (17197, 19237))

In [16]:
bag_dt_model.fit(xtrain_tfidf, train_y)
predictedvalues = bag_dt_model.predict(xvalid_tfidf)
#If you want to check your results manually you may include the tweet:d={'id':test['id'],'Tweet':valid_x,'label':accuracy}
d={'id':test['id'],'label':predictedvalues}
df=pd.DataFrame(data=d)
df.to_csv("test_predictions.csv", index=False)

In [35]:
predictedvalues = bag_dt_model.predict(xvalid_tfidf)

In [18]:
predictedvalues

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [28]:
#If you want to check your results manually you may include the tweet:d={'id':test['id'],'Tweet':valid_x,'label':accuracy}
d={'id':test['id'],'label':predictedvalues}
df=pd.DataFrame(data=d)
df.to_csv("test_predictions.csv", index=False)

## Using grid search & multiple classifier

In [21]:
from sklearn.model_selection import GridSearchCV

param_values = {'n_estimators': [10, 20, 25, 30], 'base_estimator__max_leaf_nodes':[5, 10, 15, 20], 'base_estimator__max_depth':[3, 4, 5]}

dt_model = DecisionTreeClassifier()
bag_dt_model_grid = BaggingClassifier(base_estimator=dt_model, oob_score=True, bootstrap=True, random_state=1) 
# Use GridSearchCV() to determine best parameters
bc_grid = GridSearchCV(estimator=bag_dt_model_grid, param_grid=param_values, cv=20, n_jobs=-1)
bc_grid.fit(xtrain_tfidf, train_y)
best_params = bc_grid.best_params_
print(best_params)

KeyboardInterrupt: 

In [None]:
best_dt_model = DecisionTreeClassifier(criterion='entropy', max_leaf_nodes=10, max_depth=3)                   
final_bag_dt_model = BaggingClassifier(base_estimator=best_dt_model, n_estimators=150, bootstrap=True, random_state=1, oob_score=True)