In [1]:
import pandas as pd
from sqlalchemy.engine import URL
from sqlalchemy import create_engine
from sklearn.feature_extraction.text import CountVectorizer
import re
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
import string
import nltk
from nltk.corpus import stopwords
from nltk import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import tree
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder

train_original = pd.read_csv('./TrainingData/trainingdata2.csv')
train_original.columns = ['target','id','date','flag','user','text']

train=train_original[['id','text', 'target']]

del train_original
 
#region prepare stopwords list
sw = stopwords.words('english')

#remove useful words from the stopword list
sw.remove('not')
sw.remove('no')
sw.remove('nor')
sw.remove("won't")
sw.remove("wouldn't")
sw.remove("shouldn't")
sw.remove("couldn't")
sw.remove('against')
sw.remove("aren't")
sw.remove("haven't")
sw.remove("hasn't")
sw.remove("doesn't")
sw.remove("isn't")
#endregion

def remove_pattern(text,pattern):
    # re.findall() finds the pattern i.e @user and puts it in a list for further task
    r = re.findall(pattern,text)
    
    # re.sub() removes @user from the sentences in the dataset
    for i in r:
        text = re.sub(i,"",text)
    
    return text

def stem_sentences(sentence):
    #tokenize the sentence and remove the stems of the words
    ps = PorterStemmer()
    tokens = sentence.split()
    stemmed_tokens = [ps.stem(token) for token in tokens]
    return ' '.join(stemmed_tokens)

def preprocessTweet(df, sw):
    #remove newlines
    df['text'] = df['text'].str.replace("\n"," ")
    #turn all text to lowercase
    df['text'] = df['text'].str.lower()
    # remove twitter handles (@user)
    df['text'] = np.vectorize(remove_pattern)(df['text'], "@[\w]*")
    #remove links
    df['text'] = df['text'].str.replace('http\S+|www.\S+', '', case=False)
    #remove special characters, numbers, punctuations
    df['text'] = df['text'].str.replace("[^a-zA-Z#]", " ")
    #remove short words (length < 3)
    df['text'] = df['text'].apply(lambda x: ' '.join([w for w in x.split() if (len(w)>3 or w == 'no')]))
    #remove duplicate tweets - bot prevention
    df['text'] = df['text'].drop_duplicates(keep=False)
    #remove quotes
    df['text'] = df['text'].str.replace("quot", "")
    #remove NANs
    df.dropna(inplace=True)
    #remove stopwords
    df['text'] = df['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (sw)]))
    #remove empty tweets
    df = df[df.text != '']
    #stemming
    df['text'] = df['text'].apply(stem_sentences)
    return df

train = preprocessTweet(train, sw)

  df['text'] = df['text'].str.replace('http\S+|www.\S+', '', case=False)
  df['text'] = df['text'].str.replace("[^a-zA-Z#]", " ")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['text'] = df['text'].apply(stem_sentences)


In [2]:
bow_vectorizer = CountVectorizer(max_df=0.90, min_df=2, max_features=1000, stop_words='english')
bow = bow_vectorizer.fit_transform(train['text'])
df_bow = pd.DataFrame(bow.todense())

train_bow = bow[:]
train_bow.todense()

x_train_bow, x_valid_bow, y_train_bow, y_valid_bow = train_test_split(train_bow,train['target'],test_size=0.3,random_state=42)

clf = tree.DecisionTreeClassifier(criterion='entropy')
clf.fit(x_train_bow, y_train_bow)
y_pred = clf.predict(x_valid_bow)

acc=accuracy_score(y_valid_bow,y_pred)
print(acc)

#save to csv
resultsdf = pd.DataFrame({'text': x_valid_bow, 'pred Sentiment': y_pred})
resultsdf.to_csv('results.csv', index=False)

0.6682434244261809


: 

In [None]:
tfidf = CountVectorizer(max_df=0.90, min_df=2, max_features=1000, stop_words='english')
tfidf_matrix = tfidf.fit_transform(train['text'])
df_tfidf = pd.DataFrame(tfidf_matrix.todense())

train_tfidf_matrix = tfidf_matrix[:]
train_tfidf_matrix.todense()

x_train_tfidf, x_valid_tfidf, y_train_tfidf, y_valid_tfidf = train_test_split(train_tfidf_matrix,train['target'],test_size=0.3,random_state=42)

dct = tree.DecisionTreeClassifier(criterion='entropy')
dct.fit(x_train_tfidf,y_train_tfidf)
dct_tfidf = dct.predict(x_valid_tfidf)

acc2=accuracy_score(y_valid_tfidf,dct_tfidf)
print(acc2)

0.6677941543136987


In [None]:
import pickle

#save models
with open('DTBOWmodel.pkl','wb') as f:
    pickle.dump(clf,f)
    
with open('DTTFIDFmodel.pkl','wb') as f:
    pickle.dump(dct,f)

In [None]:
#load models
with open('DTBOWmodel.pkl', 'rb') as f:
    clf = pickle.load(f)
    
with open('DTTFIDFmodel.pkl','rb') as f:
    dct = pickle.load(f)

NameError: name 'pickle' is not defined

In [21]:
from sklearn.preprocessing import StandardScaler
from sklearn import decomposition
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

std_slc = StandardScaler()
pca = decomposition.PCA()

pipe = Pipeline(steps=[('std_slc', std_slc),('pca', pca), ('clf', clf)])

X = train_bow
y = train['target']

n_components = list(range(1,X.shape[1]+1,1))
criterion = ['gini', 'entropy']
max_depth = [2,4,6,8,10,12]
parameters = dict(pca__n_components=n_components, clf__criterion=criterion, clf__max_depth=max_depth)

sc = StandardScaler(with_mean=False)
X_train = sc.fit_transform(x_train_bow)

clf_GS = GridSearchCV(pipe, parameters)
clf_GS.fit(X_train, y_train_bow)

print('best Criterion:', clf_GS.best_estimator_.get_params()['clf__criterion'])
print('Best max_depth:', clf_GS.best_estimator_.get_params()['clf__max_depth'])
print('Best no of components:', clf_GS.best_estimator_.get_params()['pca__n_components'])
print()
print(clf_GS.best_estimator_.get_params()['clf'])

ValueError: 
All the 60000 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
60000 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\andre\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\andre\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\pipeline.py", line 378, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "C:\Users\andre\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\pipeline.py", line 336, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "C:\Users\andre\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\joblib\memory.py", line 349, in __call__
    return self.func(*args, **kwargs)
  File "C:\Users\andre\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\pipeline.py", line 870, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
  File "C:\Users\andre\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\base.py", line 870, in fit_transform
    return self.fit(X, y, **fit_params).transform(X)
  File "C:\Users\andre\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\preprocessing\_data.py", line 809, in fit
    return self.partial_fit(X, y, sample_weight)
  File "C:\Users\andre\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\preprocessing\_data.py", line 872, in partial_fit
    raise ValueError(
ValueError: Cannot center sparse matrices: pass `with_mean=False` instead. See docstring for motivation and alternatives.


In [3]:
trainList = []
validList = []

for max_d in range(1,30):
  model = tree.DecisionTreeClassifier(max_depth=max_d, random_state=42)
  model.fit(x_train_bow, y_train_bow)
  trainList.append(model.score(x_train_bow, y_train_bow))
  validList.append(model.score(x_valid_bow, y_valid_bow))
  print(len(trainList))

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29


In [4]:
print(max(validList))
print(max(trainList))

0.6259061848734752
0.6360614852345515
