In [0]:
cd "E:\Text Mining\PROJECT"

E:\Text Mining\PROJECT


In [0]:
import pandas as p
import numpy as np
import itertools
from bs4 import BeautifulSoup
from nltk.tokenize import WordPunctTokenizer
import re

In [0]:
def getCleanTweet(text):
    tok = WordPunctTokenizer()
    user_pattern = '@[A-Za-z0-9_]+'
    http_pattern = 'https?://[^ ]+'
    www_pattern = 'www.[^ ]+'
    combined_pattern = '|'.join((user_pattern, http_pattern, www_pattern))
    negations_dic = {"isn't":"is not", "aren't":"are not", "wasn't":"was not", "weren't":"were not",
                    "haven't":"have not","hasn't":"has not","hadn't":"had not","won't":"will not",
                    "wouldn't":"would not", "don't":"do not", "doesn't":"does not","didn't":"did not",
                    "can't":"can not","couldn't":"could not","shouldn't":"should not","mightn't":"might not",
                    "mustn't":"must not"}
    neg_pattern = re.compile('\b(' + '|'.join(negations_dic.keys()) + ')\b')
    soup = BeautifulSoup(text, 'lxml')
    souped = soup.get_text()
    try:
        bom_removed = souped.decode("utf-8-sig").replace(u"\ufffd", "?")
    except:
        bom_removed = souped
    stripped = re.sub(combined_pattern, '', bom_removed)
    neg_handled = neg_pattern.sub(lambda x: negations_dic[x.group()], stripped)
    letters_only = re.sub("[^a-zA-Z]", " ", neg_handled).lower()
    cleaned = (" ".join(x for x in tok.tokenize(letters_only) if len(x) > 1)).strip()
    cleaned = ''.join(k + k if sum(1 for i in g) > 1 else k for k, g in itertools.groupby(cleaned))
    return cleaned

In [3]:
data = p.read_csv("tweet.csv", encoding="latin-1", names=["target", "ids", "date", "flag", "user", "text"])
data['clean_text'] = [getCleanTweet(text) for text in data['text']]
data.dropna(inplace=True)
data.reset_index(drop=True,inplace=True)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600000 entries, 0 to 1599999
Data columns (total 7 columns):
 #   Column      Non-Null Count    Dtype 
---  ------      --------------    ----- 
 0   target      1600000 non-null  int64 
 1   ids         1600000 non-null  int64 
 2   date        1600000 non-null  object
 3   flag        1600000 non-null  object
 4   user        1600000 non-null  object
 5   text        1600000 non-null  object
 6   clean_text  1600000 non-null  object
dtypes: int64(2), object(5)
memory usage: 85.4+ MB


In [0]:
x_values = data['clean_text'].values
y_values = data['target'].values
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x_values, y_values, test_size=0.10, random_state=0)

In [0]:
# Create 4 models - 
# tf unigram,  
# tf idf unigram, 
# tf unigram+bigram, 
# tf idf unigram+bigram
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
#TF unigram

tf_unigram = CountVectorizer(encoding='latin-1',binary=True,min_df = 5 )
tf_idf_unigram = TfidfVectorizer(encoding='latin-1',use_idf=True,min_df = 5)
tf_uni_bi = CountVectorizer(encoding='latin-1',binary=True, ngram_range=(1,2),min_df = 5)
tf_idf_uni_bi = TfidfVectorizer(encoding='latin-1',use_idf=True, ngram_range=(1,2),min_df = 5)

In [0]:
sample = data.sample(frac =.2) 

x_values = sample['clean_text'].values
y_values = sample['target'].values


In [0]:
tr_idf_vec = tf_idf_uni_bi.fit_transform(x_values)
ts_idf_vec = tf_idf_uni_bi.transform(x_test)

In [0]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [True],
    'max_depth': [10,80, 90],
    'max_features': ['auto'],
    'min_samples_leaf': [4],
    'n_estimators': [50, 100, 200]
}
# Create a based model
rf = RandomForestClassifier()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 2, n_jobs = -1, verbose = 2)

In [21]:
grid_search.fit(tr_idf_vec, y_values)

Fitting 2 folds for each of 9 candidates, totalling 18 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 out of  18 | elapsed: 143.9min finished


GridSearchCV(cv=2, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,
                                              rando

In [22]:
grid_search.best_params_

{'bootstrap': True,
 'max_depth': 90,
 'max_features': 'auto',
 'min_samples_leaf': 4,
 'n_estimators': 200}

In [26]:
grid_search.score(ts_idf_vec, y_test)

0.776875