In [None]:
import numpy as np
import pandas as pd

## Load Data

In [None]:
# Load the two pre-filtered tweet sets (content polluters and legitimate users)
# and stack them into a single frame, legitimate users first.
DATA_DIR = "/content/drive/Shareddrives/CSCI 5523 Group Project/Data"
pollutor_tweets_df = pd.read_csv(DATA_DIR + "/content_polluters_tweets_filtered.csv", header=0)
legitimate_users_df = pd.read_csv(DATA_DIR + "/legitimate_users_tweets_filtered.csv", header=0)
# pd.concat is the supported way to stack frames (DataFrame.append was deprecated
# in pandas 1.4 and removed in 2.0). ignore_index=True gives the combined frame a
# unique 0..n-1 index instead of repeating each file's own row labels.
dataset = pd.concat([legitimate_users_df, pollutor_tweets_df], ignore_index=True)

In [None]:
# Preview the first rows of the combined frame.
dataset.head()

Unnamed: 0,user_id,tweet_id,tweet,created_at,label
0,614,5873834688,I wish I had more free time. I'd LOVE to see you!,2009-11-19 18:16:40,0
1,614,5873809295,"Tonight, tomorrow. On the plane at 5 pm.",2009-11-19 18:15:42,0
2,614,5291252160,"I'm at Carlucci's in Salt Lake City, UT http:/...",2009-10-30 11:24:52,0
3,614,5205651441,@spam @JannetteDavid,2009-10-27 12:17:35,0
4,1038,5762418891,@dialupkid Mijn vriendin en ik hebben een geza...,2009-11-16 05:08:29,0


## Data Preprocessing

Drop rows with null values and remove empty tweets

In [None]:
# Discard rows with any missing field, then rows whose tweet text is the empty
# string, and summarise the numeric columns of what is left.
dataset = dataset.dropna()
dataset = dataset.loc[dataset['tweet'] != '']
dataset.describe()

Unnamed: 0,user_id,tweet_id,label
count,4560727.0,4560727.0,4560727.0
mean,58215010.0,7809418000.0,0.4713784
std,38294530.0,3871802000.0,0.4991802
min,614.0,5218033.0,0.0
25%,24745340.0,5585464000.0,0.0
50%,49414590.0,5936497000.0,0.0
75%,84344060.0,9117617000.0,1.0
max,173767000.0,20145990000.0,1.0


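Null values were dropped in the previous step. Optionally consider only 20,000 random tweets as a precaution for performance; the sampling line is currently commented out, so the full dataset is used. Hughdan will adjust this later.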

In [None]:
# Optional subsampling for faster experimentation (disabled: the full dataset is used).
#dataset_sample = dataset.sample(frac=1, random_state=1).head(20000)
# Take an explicit copy: plain assignment would only alias `dataset`, so the
# column rewrites applied to dataset_sample in the cleaning cells would also
# modify `dataset` and trigger pandas' SettingWithCopyWarning on the filtered slice.
dataset_sample = dataset.copy()
dataset_sample.head()

Unnamed: 0,user_id,tweet_id,tweet,created_at,label
0,614,5873834688,I wish I had more free time Id LOVE to see you,2009-11-19 18:16:40,0
1,614,5873809295,Tonight tomorrow On the plane at 5 pm,2009-11-19 18:15:42,0
2,614,5291252160,Im at Carluccis in Salt Lake City UT httpgowal...,2009-10-30 11:24:52,0
3,614,5205651441,spam JannetteDavid,2009-10-27 12:17:35,0
4,1038,5762418891,dialupkid Mijn vriendin en ik hebben een gezam...,2009-11-16 05:08:29,0


Remove Punctuation

In [None]:
import re

# Keep only ASCII letters, digits and whitespace in each tweet.
# The upper-case range must be `A-Z`: writing `A-z` would also cover the six
# characters that sit between 'Z' and 'a' in ASCII ([ \ ] ^ _ `), leaving them
# in the text. The raw string keeps `\s` from being read as a string escape.
non_alnum_pattern = re.compile(r'[^a-zA-Z0-9\s]')

dataset_sample['tweet'] = dataset_sample['tweet'].apply(lambda x: non_alnum_pattern.sub('', x))
# Stripping can leave a tweet empty, so filter again. The trailing .copy() makes
# the result an independent frame, so later column assignments are unambiguous.
dataset_sample = dataset_sample.dropna()
dataset_sample = dataset_sample[dataset_sample.tweet != ''].copy()
dataset_sample.describe()

Unnamed: 0,user_id,tweet_id,label
count,4559301.0,4559301.0,4559301.0
mean,58215440.0,7809652000.0,0.4714523
std,38296470.0,3871788000.0,0.4991844
min,614.0,5218033.0,0.0
25%,24741970.0,5585527000.0,0.0
50%,49414590.0,5936591000.0,0.0
75%,84348700.0,9118277000.0,1.0
max,173767000.0,20145990000.0,1.0


Determining top 10 characters that tweets start with

In [None]:
# Leading character of every tweet. Indexing position 0 relies on empty tweets
# having been filtered out by the cleaning step.
tweets = dataset_sample['tweet']
first_chars = tweets.map(lambda text: text[0])

In [None]:
from collections import Counter

# Distinct leading characters, in order of first appearance.
unique_chars = first_chars.unique()

# Tally all leading characters in one pass instead of re-scanning the whole
# Series once per distinct character.
first_char_tally = Counter(first_chars)
char_counts = {char: first_char_tally[char] for char in unique_chars}

# sort by descending frequency (ties keep first-appearance order) and show the top 10
sorted_chars = sorted(char_counts, key=char_counts.get, reverse=True)
sorted_chars[0:10]

Convert to lowercase

In [None]:
# Fold every tweet to lower case so tokens differing only in case are merged.
dataset_sample['tweet'] = dataset_sample['tweet'].map(str.lower)

# Create 2-grams

In [None]:
# Number of consecutive tokens fused into one feature.
NGRAM_SIZE = 2

# Tokenise each tweet on single spaces. Iterating the column directly avoids
# building a full row Series through .iloc for every one of the rows.
words = [str(tweet).split(" ") for tweet in dataset_sample['tweet']]

# Slide a window of NGRAM_SIZE tokens over each tweet and fuse the tokens
# without a separator (e.g. 'i', 'wish' -> 'iwish'). Tweets with fewer than
# NGRAM_SIZE tokens yield an empty list.
# NOTE(review): joining with "" lets different token pairs collide
# ('ab'+'c' == 'a'+'bc') - confirm this is intended.
n_gram_all = [
    ["".join(tokens[start:start + NGRAM_SIZE])
     for start in range(len(tokens) - NGRAM_SIZE + 1)]
    for tokens in words
]

n_gram_all[0][:10]


['iwish',
 'wishi',
 'ihad',
 'hadmore',
 'morefree',
 'freetime',
 'timeid',
 'idlove',
 'loveto',
 'tosee']

# Vectorizing with Hashing Vectorizer

In [None]:
from sklearn.feature_extraction.text import HashingVectorizer, CountVectorizer


def _passthrough(tokens):
    """Return the already-built n-gram list unchanged (used as the analyzer)."""
    return tokens


# Hash each tweet's pre-computed n-grams into a fixed-width vector of raw,
# non-negative counts (no sign alternation, no normalisation).
hvec = HashingVectorizer(
    lowercase=False,
    analyzer=_passthrough,
    n_features=2**12,
    alternate_sign=False,
    norm=None,
)

# features matrix X: one row per tweet
X = hvec.fit_transform(n_gram_all)

# alternative
#hvec = HashingVectorizer(lowercase=True, n_features=2**12, alternate_sign= False, norm = None, ngram_range=(1,2))
#X = hvec.fit_transform(dataset_sample['tweet'])

print(X[0])

  (0, 205)	1.0
  (0, 473)	1.0
  (0, 564)	1.0
  (0, 1199)	1.0
  (0, 1252)	1.0
  (0, 1954)	1.0
  (0, 3063)	1.0
  (0, 3167)	1.0
  (0, 3213)	1.0
  (0, 3333)	1.0
  (0, 3511)	1.0


## Alternative tokenization using CountVectorizer()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts over unigrams and bigrams, English stop words removed,
# vocabulary capped at 2**13 terms.
cvec = CountVectorizer(
    ngram_range=(1, 2),
    stop_words='english',
    max_features=2**13,
)

# Learn the vocabulary from the tweets and build the document-term matrix.
# This rebinds X, replacing any feature matrix built earlier.
X = cvec.fit_transform(dataset_sample['tweet'])

## Alternative tokenization using TfidfVectorizer()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF weights over unigrams only, English stop words removed,
# vocabulary capped at 2**13 terms.
tfidfvec = TfidfVectorizer(
    ngram_range=(1, 1),
    stop_words='english',
    max_features=2**13,
)

# Learn vocabulary and IDF weights from the tweets and build the matrix.
# This rebinds X, replacing any feature matrix built earlier.
X = tfidfvec.fit_transform(dataset_sample['tweet'])

## Split data into training and testing splits

In [None]:
from sklearn.model_selection import train_test_split

# test set size of 20% of the data and the random seed 1
# X is passed as-is: train_test_split accepts scipy sparse matrices and returns
# sparse splits. Calling X.toarray() first would allocate a dense
# (n_tweets x n_features) array, which does not fit in memory for the full dataset.
X_train, X_test, y_train, y_test = train_test_split(
    X, dataset_sample['label'], test_size=0.2, random_state=1
)

In [None]:
# for large datasets use a train-test split index array
split_idx = np.arange(dataset_sample.shape[0])
np.random.seed(0)
np.random.shuffle(split_idx)
split = 8*dataset_sample.shape[0]//10
train_idx = split_idx[:split]
test_idx = split_idx[split:]

X_train = X[train_idx]
X_test = X[test_idx]
y_train = dataset_sample['label'].iloc[train_idx]
y_test = dataset_sample['label'].iloc[test_idx]

In [None]:
# Load analysis functions
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

## Multinomial Naive Bayes Classifier

Baseline Implementation - Train/Test Split

In [None]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
# Baseline: multinomial Naive Bayes with default hyper-parameters,
# fitted on the training split.
naive_bayes = MultinomialNB()
naive_bayes.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [None]:
# Predict labels for the held-out test split with the fitted Naive Bayes model.
predictions = naive_bayes.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Report the four headline metrics for `predictions` against the held-out labels.
for template, metric_fn in (
    ('Accuracy score: {}', accuracy_score),
    ('Precision score: {}', precision_score),
    ('Recall score: {}', recall_score),
    ('F1 score: {}', f1_score),
):
    print(template.format(metric_fn(y_test, predictions)))

Accuracy score: 0.7759952902276609
Precision score: 0.7644890407393251
Recall score: 0.7583344188469984
F1 score: 0.7613992925764905


Evaluate GAN

In [None]:
# Score the Naive Bayes model on GAN-generated tweets, vectorised with the
# already-fitted TF-IDF vocabulary.
# NOTE(review): the generated text is vectorised as read from disk, whereas the
# training tweets had punctuation stripped and were lower-cased first - confirm
# the CSV is already cleaned the same way, otherwise the features are skewed.
generated_tweets_df = pd.read_csv("/content/drive/Shareddrives/CSCI 5523 Group Project/Data/GAN data/generated_tweets_1000.csv")
test = tfidfvec.transform(generated_tweets_df['generated tweets'])
label = generated_tweets_df['label']
gen_predictions = naive_bayes.predict(test)
for template, metric_fn in (
    ('Accuracy score: {}', accuracy_score),
    ('Precision score: {}', precision_score),
    ('Recall score: {}', recall_score),
    ('F1 score: {}', f1_score),
):
    print(template.format(metric_fn(label, gen_predictions)))

Accuracy score: 0.072
Precision score: 1.0
Recall score: 0.072
F1 score: 0.1343283582089552


In [None]:
# Number of generated tweets that the model predicted as class 1.
(gen_predictions == 1).sum()

72

## Decision Tree Classifier

Baseline Implementation

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Unfitted decision tree with a fixed seed; it is the base estimator for the grid search.
tree = DecisionTreeClassifier(random_state=17)

Optimization

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate tree depths (5..11) and per-split feature budgets to search over.
tree_params = {
    'max_depth': range(5,12),
    'max_features': [5,10,15,20,25,30,40,50],
}

# Exhaustive 5-fold cross-validated search using all available cores.
tree_grid = GridSearchCV(
    estimator=tree,
    param_grid=tree_params,
    cv=5,
    n_jobs=-1,
    verbose=True,
)

tree_grid.fit(X_train, y_train)

Fitting 5 folds for each of 56 candidates, totalling 280 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   24.1s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 280 out of 280 | elapsed:  2.7min finished


GridSearchCV(cv=5, error_score=nan,
             estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort='deprecated',
                                              random_state=17,
                                              splitter='best'),
             iid='deprecated', n_jobs=-1,
             param_grid={'max_depth': range(5, 12),
                         'max_

In [None]:
# Best hyper-parameter combination found by the search and its cross-validated score.
tree_grid.best_params_, tree_grid.best_score_

({'max_depth': 11, 'max_features': 50}, 0.6981519663586521)

In [None]:
# Evaluate the best decision tree from the grid search on the held-out split.
predictions = tree_grid.predict(X_test)
for template, metric_fn in (
    ('Accuracy score: {}', accuracy_score),
    ('Precision score: {}', precision_score),
    ('Recall score: {}', recall_score),
    ('F1 score: {}', f1_score),
):
    print(template.format(metric_fn(y_test, predictions)))

Accuracy score: 0.6982182676896023
Precision score: 0.785292847891783
Recall score: 0.49504773158658677
F1 score: 0.6072719231290528


## kNN Classifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier
#knn = KNeighborsClassifier(n_neighbors=5)

## Finding optimal neighbors with 3-fold CV

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Single-step pipeline around kNN; the scaled variant is kept for reference.
#knn_pipe = Pipeline([('scaler', StandardScaler()), ('knn', KNeighborsClassifier(n_jobs=-1))])
knn_pipe = Pipeline(steps=[('knn', KNeighborsClassifier(n_jobs=-1))])

# Only k=1 is evaluated here.
knn_params = {
    'knn__n_neighbors': [1],
}

# 3-fold cross-validated search using all available cores.
knn_grid = GridSearchCV(
    estimator=knn_pipe,
    param_grid=knn_params,
    cv=3,
    n_jobs=-1,
    verbose=True,
)

knn_grid.fit(X_train, y_train)

knn_grid.best_params_, knn_grid.best_score_

Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


In [None]:
# Best hyper-parameter combination found by the kNN search and its cross-validated score.
knn_grid.best_params_, knn_grid.best_score_

In [None]:
# Evaluate the best kNN pipeline from the grid search on the held-out split.
predictions = knn_grid.predict(X_test)
for template, metric_fn in (
    ('Accuracy score: {}', accuracy_score),
    ('Precision score: {}', precision_score),
    ('Recall score: {}', recall_score),
    ('F1 score: {}', f1_score),
):
    print(template.format(metric_fn(y_test, predictions)))

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Single-point grid: 200 trees, each allowed to grow up to depth 150.
rf_params = {'n_estimators': [200],
             'max_depth': [150]}

# Seed the forest so bootstrap samples and feature draws are repeatable across
# runs and across the parallel CV workers; 17 matches the decision-tree cell.
rf = RandomForestClassifier(random_state=17)

# 5-fold cross-validated fit using all available cores.
rf_grid = GridSearchCV(rf, rf_params, cv=5, n_jobs=-1, verbose=True)

rf_grid.fit(X_train, y_train)

rf_grid.best_params_, rf_grid.best_score_

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  3.5min finished


({'max_depth': 150, 'n_estimators': 200}, 0.708601736904856)

In [None]:
# Evaluate the random forest from the grid search on the held-out split.
predictions = rf_grid.predict(X_test)
for template, metric_fn in (
    ('Accuracy score: {}', accuracy_score),
    ('Precision score: {}', precision_score),
    ('Recall score: {}', recall_score),
    ('F1 score: {}', f1_score),
):
    print(template.format(metric_fn(y_test, predictions)))

Accuracy score: 0.7443733119935981
Precision score: 0.7431657104286197
Recall score: 0.7002331990672037
F1 score: 0.7210609616329203


## Logistic Regression Model

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic regression with a raised iteration cap.
log_reg = LogisticRegression(max_iter=1000)

# Single-point grid; the wider candidate lists are kept in the trailing comments.
lgr_params = {
    'C': [1.0],        #[0.8,1.0,1.2]
    'tol': [0.0001],   #[0.0001,0.001,0.01]
}

# 5-fold cross-validated fit using all available cores.
lgr_grid = GridSearchCV(
    estimator=log_reg,
    param_grid=lgr_params,
    cv=5,
    n_jobs=-1,
    verbose=True,
)

lgr_grid.fit(X_train, y_train)
lgr_grid.best_params_, lgr_grid.best_score_

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  5.4min finished


({'C': 1.0, 'tol': 0.0001}, 0.7657677713684118)

In [None]:
# Evaluate the logistic-regression model from the grid search on the held-out split.
predictions = lgr_grid.predict(X_test)
for template, metric_fn in (
    ('Accuracy score: {}', accuracy_score),
    ('Precision score: {}', precision_score),
    ('Recall score: {}', recall_score),
    ('F1 score: {}', f1_score),
):
    print(template.format(metric_fn(y_test, predictions)))

Accuracy score: 0.7665587189275559
Precision score: 0.7744636289592303
Recall score: 0.7122406252907788
F1 score: 0.7420500180557474


## Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

# Least-squares regression used as a classifier: fit on the 0/1 labels, then
# threshold the continuous output at 0.5 (>= 0.5 becomes class 1).
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)
pred = np.where(lin_reg.predict(X_test) < 0.5, 0.0, 1.0)

print('Accuracy score: {}'.format(accuracy_score(y_test.values, pred)))
for template, metric_fn in (
    ('Precision score: {}', precision_score),
    ('Recall score: {}', recall_score),
    ('F1 score: {}', f1_score),
):
    print(template.format(metric_fn(y_test, pred)))

Accuracy score: 0.7656430091867071
Precision score: 0.7742420821080888
Recall score: 0.7098701963338606
F1 score: 0.7406601061383293
