# Tweets sentiment analysis

##### By Pengyuan Ding

In [1]:
# import packages
import pandas as pd
import pickle
import numpy as np

from sklearn.dummy import DummyClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# pre-processer(TFIDF)
from sklearn.feature_extraction.text import TfidfVectorizer
# learners
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# set file directors

f_dev_tfidf = 'Data/tfidf-tweets/tfidf/dev_tfidf.pkl'
f_train_tfidf = 'Data/tfidf-tweets/tfidf/train_tfidf.pkl'
f_un_tfidf= 'Data/tfidf-tweets/tfidf/unlabeled_tfidf.pkl'
f_test_tfidf= 'Data/tfidf-tweets/tfidf/test_tfidf.pkl'

f_dev_emb = 'Data/embedding-tweets/sentence-transformers/dev_emb.pkl'
f_train_emb = 'Data/embedding-tweets/sentence-transformers/train_emb.pkl'
f_un_emb = 'Data/embedding-tweets/sentence-transformers/unlabeled_emb.pkl'
f_test_emb= 'Data/embedding-tweets/sentence-transformers/test_emb.pkl'

f_dev_raw = 'Data/raw-tweets/tweets-data/dev.pkl'
f_train_raw = 'Data/raw-tweets/tweets-data/train.pkl'
f_un_raw = 'Data/raw-tweets/tweets-data/unlabeled.pkl'
f_test_raw = 'Data/raw-tweets/tweets-data/test.pkl'

f_un_pred_AAE_raw= 'Data/raw-tweets/tweets-data/unlabeled_pred_AAE_raw.pkl'

f_dev_mpnet = 'Data/embedding-tweets/mpnet/dev_mpnet.pkl'
f_train_mpnet = 'Data/embedding-tweets/mpnet/train_mpnet.pkl'
f_un_mpnet = 'Data/embedding-tweets/mpnet/unlabeled_mpnet.pkl'
f_test_mpnet = 'Data/embedding-tweets/mpnet/test_mpnet.pkl'

f_un_pred_AAE_mpnet = 'Data/embedding-tweets/mpnet/unlabeled_pred_AAE_mpnet.pkl'

## Separate TFIDF

In [3]:
raw = [f_dev_raw, f_train_raw, f_un_raw, f_test_raw]
set_name = ['dev', 'train', 'unlabeled', 'test']
table_list = []
for n in range(len(raw)):
    with open(raw[n],'rb') as f:
        df = pickle.load(f)
        df['Set'] = set_name[n]
        table_list.append(df)
s = pd.concat(table_list).reset_index()

grouped = s.groupby(s.Demographic)  # separate
SAE = grouped.get_group("SAE")
AAE = grouped.get_group("AAE")
SAE = SAE.reset_index()
AAE = AAE.reset_index()

In [4]:
vectorizer_SAE = TfidfVectorizer(stop_words = 'english', max_features=1000)
TFIDF_SAE = vectorizer_SAE.fit_transform(SAE.text)
TFIDF_SAE = TFIDF_SAE.toarray()
words_SAE = vectorizer_SAE.get_feature_names_out()

for i in range(TFIDF_SAE.shape[0]):
    SAE.text[i] = TFIDF_SAE[i]
SAE = SAE.rename({'text': 'TFIDF'}, axis=1)

SAE_dev = SAE.loc[(SAE['Set'] == 'dev'), ['TFIDF','Sentiment','Demographic']]
SAE_dev = SAE_dev.reset_index(drop=True)
SAE_train = SAE.loc[(SAE['Set'] == 'train'), ['TFIDF','Sentiment','Demographic']]
SAE_train = SAE_train.reset_index(drop=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  SAE.text[i] = TFIDF_SAE[i]


In [5]:
vectorizer_AAE = TfidfVectorizer(stop_words = 'english', max_features=1000)
TFIDF_AAE = vectorizer_AAE.fit_transform(AAE.text)
TFIDF_AAE = TFIDF_AAE.toarray()
words_AAE = vectorizer_AAE.get_feature_names_out()

for i in range(TFIDF_AAE.shape[0]):
    AAE.text[i] = TFIDF_AAE[i]
AAE = AAE.rename({'text': 'TFIDF'}, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  AAE.text[i] = TFIDF_AAE[i]


In [6]:
comm =  list(set(words_AAE).intersection(words_SAE))
print('overlapping:',len(comm))
AAE_uniq = [i for i in words_AAE if (not i in comm)]
SAE_uniq = [i for i in words_SAE if (not i in comm)]

overlapping: 595


## Baseline (Majority-class)

In [7]:
# Separate train data set into two demographic groups
with open(f_train_mpnet,'rb') as f:
    table = pickle.load(f) 
grouped = table.groupby(table.Demographic)
SAE = grouped.get_group("SAE")
AAE = grouped.get_group("AAE")

X_train_SAE = np.stack(SAE.TFIDF)
y_train_SAE = np.stack(SAE.Sentiment)
X_train_AAE = np.stack(AAE.TFIDF)
y_train_AAE = np.stack(AAE.Sentiment)

MC_clf = DummyClassifier(strategy="most_frequent")
scores_SAE = cross_val_score(MC_clf, X_train_SAE, y_train_SAE, cv=10)
scores_AAE = cross_val_score(MC_clf, X_train_AAE, y_train_SAE, cv=10)
print('Accuracy of Majority-class on SAE set is:', np.mean(scores_SAE))
print('Accuracy of Majority-class on AAE set is:', np.mean(scores_AAE))

Accuracy of Majority-class on SAE set is: 0.5
Accuracy of Majority-class on AAE set is: 0.5


## Parameter tuning (KNN)

In [8]:
dev_sets = [f_dev_tfidf, f_dev_emb, f_dev_mpnet]  # different data representations
metrics = ['cosine','euclidean', 'manhattan']
for s in range(len(dev_sets)):
    with open(dev_sets[s],'rb') as f:
        table = pickle.load(f)
    table
    X_dev = np.stack(table.TFIDF)
    y_dev = np.stack(table.Demographic)
    
    for m in metrics:
        acc = []
        for k in range(1,10):
            neigh = KNeighborsClassifier(n_neighbors=k, metric=m)

            scores = cross_val_score(neigh, X_dev, y_dev, cv=10)
            #print('k =',k,'accuracy:',np.mean(scores))
            acc.append(np.mean(scores))
        max_acc = max(acc)
        max_index = acc.index(max_acc)
        print('data set:', s, 'metric:', m, 'best k:', max_index, 'with accuracy:', max_acc)
        

data set: 0 metric: cosine best k: 8 with accuracy: 0.6935
data set: 0 metric: euclidean best k: 0 with accuracy: 0.6162500000000001
data set: 0 metric: manhattan best k: 0 with accuracy: 0.6060000000000001
data set: 1 metric: cosine best k: 8 with accuracy: 0.7082499999999999
data set: 1 metric: euclidean best k: 8 with accuracy: 0.7205
data set: 1 metric: manhattan best k: 8 with accuracy: 0.7255
data set: 2 metric: cosine best k: 8 with accuracy: 0.73125
data set: 2 metric: euclidean best k: 8 with accuracy: 0.73125
data set: 2 metric: manhattan best k: 8 with accuracy: 0.732


Choose k=8 with cosine distance metric

## Learn on whole demographic set - original

In [9]:
train_sets = [f_train_tfidf, f_train_emb, f_train_mpnet]  
dev_sets = [f_dev_tfidf, f_dev_emb, f_dev_mpnet]

In [10]:
def run_whole(clf):
    for n in range(len(train_sets)):
        with open(train_sets[n],'rb') as f:
            table = pickle.load(f) 
        X_train = np.stack(table.TFIDF)
        y_train = np.stack(table.Sentiment)

        with open(dev_sets[n],'rb') as f:
            table_dev = pickle.load(f) 
        X_dev = np.stack(table_dev.TFIDF)
        y_dev = np.stack(table_dev.Sentiment)

        grouped = table_dev.groupby(table_dev.Demographic)  # separate
        SAE = grouped.get_group("SAE")
        AAE = grouped.get_group("AAE")
        SAE = SAE.reset_index()
        AAE = AAE.reset_index()
        X_SAE = np.stack(SAE.TFIDF)
        y_SAE = np.stack(SAE.Sentiment)
        X_AAE = np.stack(AAE.TFIDF)
        y_AAE = np.stack(AAE.Sentiment)


        clf.fit(X_train, y_train)
        pred = clf.predict(X_dev)
        pred_SAE = clf.predict(X_SAE)
        pred_AAE = clf.predict(X_AAE)

        acc = accuracy_score(y_dev, pred)
        acc_SAE = accuracy_score(y_SAE, pred_SAE)
        acc_AAE = accuracy_score(y_AAE, pred_AAE)

        print('data set:',n , 'total accuray:', acc, 'SAE accuray:', acc_SAE, 'AAE accuracy:', acc_AAE)
        

### KNN

In [11]:
#create KNN classifier
clf = KNeighborsClassifier(n_neighbors=8, metric='cosine')
run_whole(clf)

data set: 0 total accuray: 0.6205 SAE accuray: 0.6455 AAE accuracy: 0.5955
data set: 1 total accuray: 0.6725 SAE accuray: 0.7 AAE accuracy: 0.645
data set: 2 total accuray: 0.6745 SAE accuray: 0.716 AAE accuracy: 0.633


### Naive Bayes

Assumptions:
1. conditional independence
    Not appropriate, (e.g. given sentiment=Negative, words 'angry' and 'fury' are clearly correlated.)
2. Distribution
    1. p(y) - bernuoli
    2. p(x|y) - Gaussian

In [12]:
#Create a Gaussian Classifier
clf = GaussianNB()
run_whole(clf)

data set: 0 total accuray: 0.64525 SAE accuray: 0.679 AAE accuracy: 0.6115
data set: 1 total accuray: 0.61475 SAE accuray: 0.652 AAE accuracy: 0.5775
data set: 2 total accuray: 0.61 SAE accuray: 0.65 AAE accuracy: 0.57


### Logistic Regression

In [13]:
#Create a Logistic Regression Classifier
clf = LogisticRegression(random_state=0, max_iter=1000)
run_whole(clf)

data set: 0 total accuray: 0.67875 SAE accuray: 0.7095 AAE accuracy: 0.648
data set: 1 total accuray: 0.69825 SAE accuray: 0.732 AAE accuracy: 0.6645
data set: 2 total accuray: 0.7205 SAE accuray: 0.7635 AAE accuracy: 0.6775


## Learn on separate data sets (Excluding new TFIDF)

In [14]:
def run_separate(clf):
    for n in range(len(train_sets)):
        with open(train_sets[n],'rb') as f:
            table = pickle.load(f)
        grouped = table.groupby(table.Demographic)
        SAE = grouped.get_group("SAE")
        AAE = grouped.get_group("AAE")

        X_SAE = np.stack(SAE.TFIDF)
        y_SAE = np.stack(SAE.Sentiment)

        scores_SAE = cross_val_score(clf, X_SAE, y_SAE, cv=10)
        acc_SAE = np.mean(scores_SAE)

        X_AAE = np.stack(AAE.TFIDF)
        y_AAE = np.stack(AAE.Sentiment)

        scores_AAE = cross_val_score(clf, X_AAE, y_AAE, cv=10)
        acc_AAE = np.mean(scores_AAE)    

        print('data set:',n , 'SAE accuray:', acc_SAE, 'AAE accuracy:', acc_AAE)


### KNN

In [15]:
# create KNN classifier
clf = KNeighborsClassifier(n_neighbors=8, metric='cosine')
run_separate(clf)

data set: 0 SAE accuray: 0.65295 AAE accuracy: 0.5955999999999999
data set: 1 SAE accuray: 0.702 AAE accuracy: 0.6266499999999999
data set: 2 SAE accuray: 0.7162 AAE accuracy: 0.6256


### Naive Bayes

In [16]:
#Create a Gaussian Classifier
clf = GaussianNB()
run_separate(clf)

data set: 0 SAE accuray: 0.64355 AAE accuracy: 0.59595
data set: 1 SAE accuray: 0.6583 AAE accuracy: 0.59235
data set: 2 SAE accuray: 0.6634 AAE accuracy: 0.58455


### Logistic Regression

In [17]:
#Create a Logistic Regression Classifier
clf = LogisticRegression(random_state=0, max_iter=1000)
run_separate(clf)

data set: 0 SAE accuray: 0.70685 AAE accuracy: 0.6393
data set: 1 SAE accuray: 0.7386 AAE accuracy: 0.6567500000000001
data set: 2 SAE accuray: 0.76825 AAE accuracy: 0.6768


## Learn on  new TFIDF (Separate)

#### Create separate TFIDF

In [18]:
raw = [f_dev_raw, f_train_raw, f_un_raw, f_test_raw]
set_name = ['dev', 'train', 'unlabeled', 'test']
table_list = []
for n in range(len(raw)):
    with open(raw[n],'rb') as f:
        df = pickle.load(f)
        df['Set'] = set_name[n]
        table_list.append(df)
s = pd.concat(table_list).reset_index()

grouped = s.groupby(s.Demographic)  # separate
SAE = grouped.get_group("SAE")
AAE = grouped.get_group("AAE")
SAE = SAE.reset_index()
AAE = AAE.reset_index()

In [19]:
vectorizer_SAE = TfidfVectorizer(stop_words = 'english', max_features=1000)
TFIDF_SAE = vectorizer_SAE.fit_transform(SAE.text)
TFIDF_SAE = TFIDF_SAE.toarray()
words_SAE = vectorizer_SAE.get_feature_names_out()

for i in range(TFIDF_SAE.shape[0]):
    SAE.text[i] = TFIDF_SAE[i]
SAE = SAE.rename({'text': 'TFIDF'}, axis=1)

SAE_dev = SAE.loc[(SAE['Set'] == 'dev'), ['TFIDF','Sentiment','Demographic']]
SAE_dev = SAE_dev.reset_index(drop=True)
SAE_train = SAE.loc[(SAE['Set'] == 'train'), ['TFIDF','Sentiment','Demographic']]
SAE_train = SAE_train.reset_index(drop=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  SAE.text[i] = TFIDF_SAE[i]


In [20]:
vectorizer_AAE = TfidfVectorizer(stop_words = 'english', max_features=1000)
TFIDF_AAE = vectorizer_AAE.fit_transform(AAE.text)
TFIDF_AAE = TFIDF_AAE.toarray()
words_AAE = vectorizer_AAE.get_feature_names_out()

for i in range(TFIDF_AAE.shape[0]):
    AAE.text[i] = TFIDF_AAE[i]
AAE = AAE.rename({'text': 'TFIDF'}, axis=1)
AAE_dev = AAE.loc[(AAE['Set'] == 'dev'), ['TFIDF','Sentiment','Demographic']]
AAE_dev = AAE_dev.reset_index(drop=True)
AAE_train = AAE.loc[(AAE['Set'] == 'train'), ['TFIDF','Sentiment','Demographic']]
AAE_train = AAE_train.reset_index(drop=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  AAE.text[i] = TFIDF_AAE[i]


In [21]:
comm =  list(set(words_AAE).intersection(words_SAE))
print('overlapping:',len(comm))
AAE_uniq = [i for i in words_AAE if (not i in comm)]
SAE_uniq = [i for i in words_SAE if (not i in comm)]

overlapping: 595


In [22]:
def run_new_tfidf(clf):

    X_SAE = np.stack(SAE_train.TFIDF)
    y_SAE = np.stack(SAE_train.Sentiment)

    scores_SAE = cross_val_score(clf, X_SAE, y_SAE, cv=10)
    acc_SAE = np.mean(scores_SAE)

    X_AAE = np.stack(AAE_train.TFIDF)
    y_AAE = np.stack(AAE_train.Sentiment)

    scores_AAE = cross_val_score(clf, X_AAE, y_AAE, cv=10)
    acc_AAE = np.mean(scores_AAE)    

    print('SAE accuray:', acc_SAE, 'AAE accuracy:', acc_AAE)

#### KNN

In [23]:
# create KNN classifier
clf = KNeighborsClassifier(n_neighbors=8, metric='cosine')
run_new_tfidf(clf)

SAE accuray: 0.6596499999999998 AAE accuracy: 0.5982999999999999


#### Naive Bayes

In [24]:
#Create a Gaussian Classifier
clf = GaussianNB()
run_new_tfidf(clf)

SAE accuray: 0.675 AAE accuracy: 0.61175


### Logistic Regression

In [25]:
#Create a Logistic Regression Classifier
clf = LogisticRegression(random_state=0, max_iter=1000)
run_new_tfidf(clf)

SAE accuray: 0.7091500000000001 AAE accuracy: 0.6403000000000001


### Semi-supervised (Self-training)

In [26]:
from sklearn.semi_supervised import SelfTrainingClassifier
clf = LogisticRegression(random_state=0, max_iter=1000)
st_clf = SelfTrainingClassifier(clf, threshold=0.85, criterion='threshold', max_iter=10, verbose=True)

In [27]:
with open(f_dev_emb,'rb') as f:
    table = pickle.load(f)
X = np.stack(table.TFIDF)
y = np.stack(table.Sentiment)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

with open(f_un_emb,'rb') as f:
    untable = pickle.load(f)

X_un = np.stack(untable.TFIDF)
y_un = np.full(untable.Sentiment.shape , -1)


X_comb = np.concatenate((X_train, X_un))
y_comb = np.concatenate((y_train, y_un), dtype=object)

st_clf.fit(X_comb, y_comb)
st_clf.score(X_test, y_test)

End of iteration 1, added 20533 new labels.
End of iteration 2, added 34209 new labels.
End of iteration 3, added 17975 new labels.
End of iteration 4, added 6487 new labels.
End of iteration 5, added 2477 new labels.
End of iteration 6, added 1214 new labels.
End of iteration 7, added 694 new labels.
End of iteration 8, added 368 new labels.
End of iteration 9, added 206 new labels.
End of iteration 10, added 121 new labels.


0.66625

### Predict dialect group to select candidate data for semi-supervised learning


In [28]:
with open(f_train_mpnet,'rb') as f:
    table = pickle.load(f) 
X_train = np.stack(table.TFIDF)
y_train = np.stack(table.Demographic)


with open(f_un_mpnet,'rb') as f:
    table = pickle.load(f) 
X_test = np.stack(table.TFIDF)
y_test = np.stack(table.Demographic)

clf = LogisticRegression(random_state=0, max_iter=1000)

clf.fit(X_train, y_train)
pp = clf.predict_proba(X_test)
prd = clf.predict(X_test)

In [29]:
l = pp.shape[0]
slc = []
for i in range(l):
    slc.append(pp[i,0] >= 0.85)

In [30]:
with open(f_un_mpnet,'rb') as f:
    table_mpnet = pickle.load(f) 
table_mpnet.Demographic[slc] = 'AAE'
table_mpnet.to_pickle(f_un_pred_AAE_mpnet)

In [31]:
with open(f_un_raw,'rb') as f:
    table_raw = pickle.load(f) 
table_raw.Demographic[slc] = 'AAE'
table_raw.to_pickle(f_un_pred_AAE_raw)
with open(f_un_pred_AAE_raw,'rb') as f:
    table_raw = pickle.load(f) 
table_raw[slc]

Unnamed: 0,text,Sentiment,Demographic
6,3:16 for mommy,,AAE
10,Kmsl Olivia got me pissy weak,,AAE
20,Good morning ppl ... Feeling kinda down this m...,,AAE
21,\ _TWITTER-ENTITY_ : Niggas mad ashit they don...,,AAE
25,Bitches be mad because they set they self up,,AAE
...,...,...,...
99968,But I miss Tweety I need To G See Her And My G...,,AAE
99971,Taj is killing me tonight,,AAE
99973,Left alone with big fat fatty . She was such a...,,AAE
99983,â _TWITTER-ENTITY_ : _TWITTER-ENTITY_ _TWITT...,,AAE


In [35]:
# create full AAE data

raw = [f_dev_raw, f_train_raw, f_un_raw, f_test_raw, f_un_pred_AAE_raw]
set_name = ['dev', 'train', 'unlabeled', 'test', 'pred']
table_list = []
for n in range(len(raw)):
    with open(raw[n],'rb') as f:
        df = pickle.load(f)
        df['Set'] = set_name[n]
        table_list.append(df)
s = pd.concat(table_list).reset_index()

grouped = s.groupby(s.Demographic)
SAE = grouped.get_group("SAE")
AAE = grouped.get_group("AAE")
SAE = SAE.reset_index()
AAE = AAE.reset_index()
vectorizer_AAE = TfidfVectorizer(stop_words = 'english', max_features=1000)
TFIDF_AAE = vectorizer_AAE.fit_transform(AAE.text)
TFIDF_AAE = TFIDF_AAE.toarray()
words_AAE = vectorizer_AAE.get_feature_names_out()
for i in range(TFIDF_AAE.shape[0]):
    AAE.text[i] = TFIDF_AAE[i]
AAE = AAE.rename({'text': 'TFIDF'}, axis=1)

AAE_dev = AAE.loc[(AAE['Set'] == 'dev'), ['TFIDF','Sentiment','Demographic']]
AAE_dev = AAE_dev.reset_index(drop=True)

AAE_train = AAE.loc[(AAE['Set'] == 'train'), ['TFIDF','Sentiment','Demographic']]
AAE_train = AAE_train.reset_index(drop=True)

AAE_unlabeled = AAE.loc[(AAE['Set'] == 'pred'), ['TFIDF','Sentiment','Demographic']]
AAE_unlabeled = AAE_unlabeled.reset_index(drop=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  AAE.text[i] = TFIDF_AAE[i]


In [36]:
#clf = GaussianNB()
clf = LogisticRegression(random_state=0, max_iter=1000)
#clf = KNeighborsClassifier(n_neighbors=51, weights = 'distance', metric='cosine')

X_dev = np.stack(AAE_dev.TFIDF)
y_dev = np.stack(AAE_dev.Sentiment)
scores = cross_val_score(clf, X_dev, y_dev, cv=10)
np.mean(scores)

0.6045

In [37]:
from sklearn.semi_supervised import SelfTrainingClassifier
clf = LogisticRegression(random_state=0, max_iter=1000)
st_clf = SelfTrainingClassifier(clf, threshold=0.85, criterion='threshold', max_iter=50, verbose=True)

X_train, X_test, y_train, y_test = train_test_split(X_dev, y_dev, test_size=0.2, random_state=0)

X_un = np.stack(AAE_train.TFIDF)
y_un = np.full(AAE_train.Sentiment.shape , -1)


X_comb = np.concatenate((X_train, X_un))
y_comb = np.concatenate((y_train, y_un), dtype=object)

st_clf.fit(X_comb, y_comb)
st_clf.score(X_test, y_test)

End of iteration 1, added 211 new labels.
End of iteration 2, added 528 new labels.
End of iteration 3, added 673 new labels.
End of iteration 4, added 670 new labels.
End of iteration 5, added 634 new labels.
End of iteration 6, added 576 new labels.
End of iteration 7, added 328 new labels.
End of iteration 8, added 222 new labels.
End of iteration 9, added 177 new labels.
End of iteration 10, added 131 new labels.
End of iteration 11, added 88 new labels.
End of iteration 12, added 69 new labels.
End of iteration 13, added 26 new labels.
End of iteration 14, added 27 new labels.
End of iteration 15, added 21 new labels.
End of iteration 16, added 13 new labels.
End of iteration 17, added 12 new labels.
End of iteration 18, added 2 new labels.
End of iteration 19, added 1 new labels.


0.595