In [177]:
import sys

import h2o
import pandas as pd
import numpy as np
from scipy.stats import pearsonr #correlation
from sklearn.feature_extraction.text import TfidfVectorizer
import H2OGBMClassifier as hgb
from sklearn.metrics import roc_auc_score
import sys
from collections import defaultdict
import operator
#import sys

#reload(sys)  # Reload does the trick!
#sys.setdefaultencoding('utf-8')

#Dataset provided 
#file = './sentiment_m140_.csv'
#twitter_df = pd.read_csv(file)
#twitter_df = twitter_df.rename(columns={"target": "polarity", "ids": "id", "flag" : "query"})
#twitter_df["new_date"] = twitter_df["date"]

#Dataset Created
file = './sentiment_analysis_10k.csv' 
twitter_df = pd.read_csv(file,encoding = "ISO-8859-1")


print (twitter_df.shape)
#Update the target to 0 or 1 -- came in as 0 or 4 
twitter_df.loc[twitter_df["polarity"] == 4,"polarity"] = 1
print (" average target is %f "  % np.mean(twitter_df["polarity"]))

(200000, 7)
 average target is 0.500000 


In [179]:
# Split up positive and negative tweets into two dataframes
positive_tweets_df = twitter_df.loc[twitter_df["polarity"] ==1]
negative_tweets_df = twitter_df.loc[twitter_df["polarity"] ==0]

train_pos = positive_tweets_df[:int(len(positive_tweets_df)*0.8)]
test_pos = positive_tweets_df[int(len(positive_tweets_df)*0.8):]
train_neg = negative_tweets_df[:int(len(positive_tweets_df)*0.8)]
test_neg = negative_tweets_df[int(len(positive_tweets_df)*0.8):]

# combine positive and negative labels
train_X = pd.concat([train_pos ,train_neg]) 
test_X = pd.concat([test_pos ,test_neg])



In [180]:
# create our target arrays
train_y = train_X["polarity"]
test_y = test_X["polarity"]

In [181]:
# Remove the columns we're not using as features
train_X.drop(["polarity"],inplace=True, axis=1)
train_X.drop(["id"],inplace=True, axis=1)
train_X.drop(["date"],inplace=True, axis=1)
train_X.drop(["query"],inplace=True, axis=1)
train_X.drop(["user"],inplace=True, axis=1)
train_X.drop(["new_date"],inplace=True, axis=1)

test_X.drop(["polarity"],inplace=True, axis=1)
test_X.drop(["id"],inplace=True, axis=1)
test_X.drop(["date"],inplace=True, axis=1)
test_X.drop(["query"],inplace=True, axis=1)
test_X.drop(["user"],inplace=True, axis=1)
test_X.drop(["new_date"],inplace=True, axis=1)

In [182]:
#tf-idf model
tfv=TfidfVectorizer(min_df=0, max_features=3000, strip_accents='unicode',lowercase =True,
                              analyzer='word', token_pattern=r'\w{3,}', ngram_range=(1,1),
                              use_idf=True,smooth_idf=True, sublinear_tf=True, stop_words = "english")   
#h2o gbm model
model=hgb.H2OGBMClassifier (ntrees=100,
                            learn_rate=0.1,
                            distribution="bernoulli",
                            col_sample_rate=1.0,
                            col_sample_rate_per_tree =0.5,
                            nthread=15,
                            sample_rate=0.9,
                            stopping_metric="logloss",
                            nbins=255,
                            min_rows=1,
                            ram="20G",
                            max_depth=4,
                            seed=1)

#apply tf idf
data=tfv.fit_transform(train_X["text"].to_numpy())
data=data.toarray()
print (data.shape) 



Checking whether there is an H2O instance running at http://localhost:54321 . connected.


0,1
H2O_cluster_uptime:,2 hours 27 mins
H2O_cluster_timezone:,America/Los_Angeles
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.36.1.1
H2O_cluster_version_age:,1 month and 9 days
H2O_cluster_name:,H2O_from_python_ilan_u0rxu7
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,15.97 Gb
H2O_cluster_total_cores:,16
H2O_cluster_allowed_cores:,15


(160000, 3000)


In [192]:
test_data = tfv.fit_transform(test_X["text"].to_numpy())
test_data = test_data.toarray()
print(test_data.shape)

(40000, 3000)


In [None]:
#fit model
model.fit(data,np.array(train_y)) # feed target


In [None]:
#make predictions)probabilities) on tweets
preds=model.predict_proba(test_data)[:,1]

In [None]:
print ("training auc is %f" %roc_auc_score(test_y,preds) )

In [185]:
from sklearn.ensemble import GradientBoostingClassifier

# Create a classifier object
learning_rates = [0.1]
for learning_rate in learning_rates:
    classifier = GradientBoostingClassifier(n_estimators=100,
                                            learning_rate=learning_rate,
                                            max_features=20,
                                            max_depth=4)

    # Fit the model
    classifier.fit(data, train_y)
    print("Learning rate: ", learning_rate)

    # Score the model
    print("Accuracy score (training): {0:.3f}".format(
        classifier.score(
            data,
            train_y)))
    print("Accuracy score (validation): {0:.3f}".format(
        classifier.score(
            test_data,
            test_y)))

Learning rate:  0.05
Accuracy score (training): 0.699
Accuracy score (validation): 0.558
Learning rate:  0.1
Accuracy score (training): 0.702
Accuracy score (validation): 0.575


In [186]:
GBS_preds=classifier.predict_proba(test_data)[:,1]

In [187]:
# Make Prediction
gbs_predictions = classifier.predict(test_data)

In [188]:
test_X["target"] = test_y
test_X["GBS_Prediction"] = gbs_predictions
test_X["GBS_preds"] = GBS_preds

In [190]:
test_X.to_csv("test_results.csv")

In [193]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
# Calculating the accuracy score
acc_score = accuracy_score(test_y, gbs_predictions)
print(f"Accuracy Score : {acc_score}")

Accuracy Score : 0.575325


In [195]:
# Generate the confusion matrix
cm = confusion_matrix(test_y, gbs_predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"]
)

# Displaying results
display(cm_df)

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,6507,13493
Actual 1,3494,16506


In [196]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=100, max_depth=4, max_features =50)

In [197]:
# Fit the model
rfc.fit(data, train_y)

RandomForestClassifier(max_depth=4, max_features=50)

In [198]:
RFC_preds=rfc.predict_proba(test_data)[:,1]

In [199]:
# Make Prediction
RFC_predictions = rfc.predict(test_data)

In [200]:
test_X["target"] = test_y
test_X["RFC_Prediction"] = RFC_predictions
test_X["RFC_preds"] = RFC_preds

In [201]:
test_X.to_csv("test_results.csv")

In [202]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
# Calculating the accuracy score
acc_score = accuracy_score(test_y, RFC_predictions)
print(f"Accuracy Score : {acc_score}")

Accuracy Score : 0.5792


In [203]:
# Generate the confusion matrix
cm = confusion_matrix(test_y, RFC_predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"]
)

# Displaying results
display(cm_df)

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,6098,13902
Actual 1,2930,17070


In [204]:
test_X.to_csv("test_results.csv")