In [41]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_predict, train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.preprocessing import StandardScaler

from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize, pos_tag
from collections import defaultdict
import re
import json
from sklearn.metrics import confusion_matrix
import pickle

tag_map = defaultdict(lambda: wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV
tag_map['AS'] = wn.ADJ_SAT

# filepath = "finalized_8K_accounts.csv"
# filepath = "UNLABELED_accounts_emojis_replaced.csv"
filepath = "Spill_Accounts_To_Be_Labeled.csv"
hand_label = "hand.label"
government = "gov"
academia = "acad"
tourBiz = "tourbiz"

df = pd.read_csv(filepath)

# df = df[((df[hand_label] == 'media') | (df[hand_label] == tourBiz) |(df[hand_label] == academia) | (df[hand_label] == government) | (
#        df[hand_label] == 'other'))]

df = df[['username', 'description']]  # keep only relevant columns

lemmatizer = WordNetLemmatizer()
words_not_changed = ['media']


def preprocessing(row):
    if str(row) == "nan":
        lemma = ""
    else:
        row = str(row).lower()
        row = word_tokenize(row)  # tokenize
        lemma = [lemmatizer.lemmatize(token, tag_map[tag[0]]) if token not in words_not_changed else token for
                 token, tag in pos_tag(row)]  # lemmatization, depending on part-of-speech
        lemma = ["" if re.search(r'\b[0-9]+\b\s*', lem) else lem for lem in lemma]  # removing
    return str(lemma)


df['description_lemmatized'] = df['description'].apply(preprocessing)

print(df)


             username                                        description  \
0       ChungSunPark4                                      attempt me!!!   
1        LucilaQuanti  Me gusta la gente con sentido del humor, alegr...   
2       patdefranchis  I love people with a large dose of humor & a r...   
3       saravastiares  Have courage & Be Kind. Where there is kindnes...   
4      TheShogunGamer  Video Game Extraordinaire 🎮 Polyamorous ❤ Disa...   
...               ...                                                ...   
3493   ChrisFischer07                                                NaN   
3494       ryantpa813  Eight One Three Sports\nBolts, Bucs, Rays, Soo...   
3495  TueNiteRockStar                                                NaN   
3496     purpletang99  i tweet my opinions only. yes there are except...   
3497       firefly909  Unapologetic bleeding heart liberal. Hate liar...   

                                 description_lemmatized  
0                      ['atte

In [42]:
# all the empty descriptions
print(df.shape)
print(df[df['description_lemmatized'] != ""].shape) 

(3498, 3)
(2908, 3)


In [43]:
# Remove all the empty descriptions
empty_rows = df[df['description_lemmatized'] == ""]
print(empty_rows.shape)
df = df[df['description_lemmatized'] != ""]
print(df.shape)
#df[hand_label]
#print(df.shape)
#df[df['description_lemmatized'] != ""].shape

(590, 3)
(2908, 3)


In [44]:
# Re-indexing the remaining observations
df = df.reset_index(drop=True)

In [145]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-MiniLM-L6-v2')

# print(type(df[['description_lemmatized']]))
embeddings = model.encode(df['description'].tolist())

ImportError: cannot import name 'Tensor' from 'torch' (unknown location)

In [None]:
# filename = 'SVM_BOW_unweighted_enhanced_model.pickle'
filename = 'SVM_BERT_unweighted_enhanced_model_full(1, 2).pickle'
loaded_model = pickle.load(open(filename, 'rb'))

X_test = embeddings

bag_of_words_y_pred_test = loaded_model.predict(X_test)

bag_of_words_y_pred_test

pred_prob = loaded_model.predict_proba(X_test)



In [None]:
pred_prob
bag_of_words_y_pred_test
pd.concat([pd.DataFrame(bag_of_words_y_pred_test), pd.DataFrame(pred_prob)], axis=1)

In [None]:
pred_prob.shape

In [None]:
pred_prob_df = pd.DataFrame(pred_prob, columns = ['acad_prob','gov_prob','media_prob','other_prob', 'tourbiz_prob'])

bag_of_words_y_pred_test.size

df['hand.label_simplified'] = bag_of_words_y_pred_test
#df = df.drop(columns=['description_lemmatized'])
df1 = pd.concat([df, pred_prob_df], axis=1)
#df1 = pd.DataFrame(my_array, columns = ['acad_prob','gov_prob','media_prob','other_prob', 'tourbiz_prob'])

df1
df1.shape
#pred_prob_df.shape
#len(bag_of_words_y_pred_test)

In [151]:
df1

NameError: name 'df1' is not defined

In [48]:
df1.to_csv(r'SVM_BERT_unweighted_UNLABELED_PREDICTED_accounts_W_PROBABILITIES_emojis_unchanged.csv', index=False)

In [49]:
import pandas as pd
spill_labeled_accs = pd.read_csv("Spill_Labeled.csv")
#spill_labeled_accs

In [51]:
print(spill_labeled_accs.size)

print(spill_labeled_accs["hand.label_simplified"].value_counts())

spill_labeled_accs["hand.label_simplified"].value_counts("row")

26172
hand.label_simplified
other      2680
media       162
acad         62
tourbiz       3
gov           1
Name: count, dtype: int64


hand.label_simplified
other      0.921596
media      0.055708
acad       0.021320
tourbiz    0.001032
gov        0.000344
Name: proportion, dtype: float64

In [53]:
# getting the rows labeled as other
mask = spill_labeled_accs["hand.label_simplified"] == 'other'


row_list = []
# Check if any of the probabilities (except 'other_prob') are greater than 0.3
for index, row in spill_labeled_accs[mask].iterrows():
    for col in ['acad_prob', 'gov_prob', 'media_prob', 'other_prob', 'tourbiz_prob']:
        if col != 'other_prob' and row[col] > 0.3:
            # saving the columns to review
            row_list.append(row)
            # Update the prediction column to the column name where the probability is higher than 0.3
            #spill_labeled_accs.at[index, "hand.label_simplified"] = col.replace('_prob', '')
rows_for_review = pd.DataFrame(row_list)
rows_for_review = rows_for_review.drop_duplicates()
#print(rows_for_review)

rows_for_review.to_csv("Accounts_To_Relabel.csv")

In [55]:
print(spill_labeled_accs["hand.label_simplified"].value_counts())

spill_labeled_accs["hand.label_simplified"].value_counts("row")
spill_labeled_accs

hand.label_simplified
other      2680
media       162
acad         62
tourbiz       3
gov           1
Name: count, dtype: int64


Unnamed: 0,username,description,description_lemmatized,hand.label_simplified,acad_prob,gov_prob,media_prob,other_prob,tourbiz_prob
0,ChungSunPark4,attempt me!!!,"['attempt', 'me', '!', '!', '!']",other,0.000776,0.000603,0.004676,0.993783,0.000162
1,LucilaQuanti,"Me gusta la gente con sentido del humor, alegr...","['me', 'gusta', 'la', 'gente', 'con', 'sentido...",other,0.009851,0.001305,0.006343,0.981409,0.001092
2,patdefranchis,I love people with a large dose of humor & a r...,"['i', 'love', 'people', 'with', 'a', 'large', ...",other,0.003543,0.000746,0.004899,0.990260,0.000552
3,saravastiares,Have courage & Be Kind. Where there is kindnes...,"['have', 'courage', '&', 'be', 'kind', '.', 'w...",other,0.003940,0.000537,0.007693,0.985011,0.002819
4,TheShogunGamer,Video Game Extraordinaire 🎮 Polyamorous ❤ Disa...,"['video', 'game', 'extraordinaire', '🎮', 'poly...",other,0.000925,0.000598,0.003607,0.994229,0.000641
...,...,...,...,...,...,...,...,...,...
2903,meevans59,Retired Department of Defense civilian employe...,"['retired', 'department', 'of', 'defense', 'ci...",other,0.001988,0.004162,0.002987,0.990808,0.000055
2904,dvdhnz,"Techie by trade, recalcitrant by design, socia...","['techie', 'by', 'trade', ',', 'recalcitrant',...",other,0.015865,0.002104,0.001636,0.978123,0.002272
2905,ryantpa813,"Eight One Three Sports\nBolts, Bucs, Rays, Soo...","['eight', 'one', 'three', 'sport', 'bolt', ','...",other,0.000745,0.001396,0.033677,0.962982,0.001200
2906,purpletang99,i tweet my opinions only. yes there are except...,"['i', 'tweet', 'my', 'opinion', 'only', '.', '...",other,0.003356,0.000360,0.076140,0.919635,0.000509


In [57]:
## replacing some rows with their manually labeled values

relabeled = pd.read_csv("Final_Account_Relabeling.csv")
mask = spill_labeled_accs["username"].isin(relabeled["username"])
# Update columns in df1 with corresponding values from df2 where usernames match
for index, row in spill_labeled_accs[mask].iterrows():
    relabel_row = relabeled[relabeled['username'] == row['username']]
    spill_labeled_accs.loc[index, 'hand.label_simplified'] = relabel_row['hand.label_simplified'].values
spill_labeled_accs["hand.label_simplified"].value_counts()

hand.label_simplified
other      2606
media       162
acad         62
other        36
media        17
acad         14
gov           5
tourbiz       3
tourbiz       2
gov           1
Name: count, dtype: int64

In [59]:
spill_accs_merged = pd.merge(spill_labeled_accs, empty_rows, how="outer")

print(spill_accs_merged.shape)
print(spill_accs_merged)

(3498, 9)
             username                                        description  \
0             004nino  Retraité.Égalité,fraternité,justice,liberté,na...   
1              00a03d  Mom of one, grandma of two, sister of six, aun...   
2     05adamlover0129  LOVE Adam Lambert for life!!! Also love Miami ...   
3     0Plongstocking2  Not a traitor. Traveler. Liker of food. Orchid...   
4             0Thessa  💚... schönheit\nist die natur! die kunst ist u...   
...               ...                                                ...   
3493   zoeycarmicheal  The Univ. of Alabama Alumni. Former Gymnast an...   
3494           zpleat  Research at Media Matters for America. All twe...   
3495         zshahan3  Human (maybe), writer + chief editor + CEO @Cl...   
3496          zul1732  You can't see people for what they are when yo...   
3497        zyiteblog  E-Scootersworld is global Power transport reta...   

                                 description_lemmatized hand.label_simplified

In [61]:
# acccounts labeled during Red-Tide research
prev_accs = pd.read_csv("Final_Account_Labels_for_Dashboard.csv")
print(prev_accs)
accounts_merged = pd.concat([spill_accs_merged, prev_accs], ignore_index=True)

accounts_merged.to_csv("ALL_Labeled_Accounts_Spill&RedTide.csv")

              username                                        description  \
0                  CNN  It’s our job to #GoThere & tell the most diffi...   
1               NatGeo  Taking our understanding and awareness of the ...   
2              FoxNews  Follow America's #1 cable news network, delive...   
3       washingtonpost                         Democracy Dies in Darkness   
4                  ABC  The only official ABC News Twitter account. Do...   
...                ...                                                ...   
29128  EvergreenZephyr  Wichita, Kansas, United (sic) States. Parody a...   
29129         johntfox  Madeleine & Marin's Dad | Gin Enthusiast | Twe...   
29130         SeGreene  Cranky former nurse and current plant patholog...   
29131      CherylLasse  Passionate about the environment, science and ...   
29132          jen_pic  🚫socialism. Pay your debts, ALL OF THEM! Nothi...   

       Label Label.Type  
0      media       Hand  
1      media       Hand

In [63]:
oil = pd.read_csv("../spill_data/Cleaned_Files/C_All_Oil.csv")
sewage = pd.read_csv("../spill_data/Cleaned_Files/C_All_Sewage.csv")
industrial = pd.read_csv("../spill_data/Cleaned_Files/C_All_Industrial.csv")

oil_accs = accounts_merged[accounts_merged["username"].isin(oil["username"])]
print(oil_accs["hand.label_simplified"].value_counts())
print(oil_accs["hand.label_simplified"].value_counts("row"))

sewage_accs = accounts_merged[accounts_merged["username"].isin(sewage["username"])]
print(sewage_accs["hand.label_simplified"].value_counts())
print(sewage_accs["hand.label_simplified"].value_counts("row"))

industrial_accs = accounts_merged[accounts_merged["username"].isin(industrial["username"])]
print(industrial_accs["hand.label_simplified"].value_counts())
print(industrial_accs["hand.label_simplified"].value_counts("row"))

hand.label_simplified
other      247
media       31
acad        19
media        7
acad         3
other        2
gov          1
tourbiz      1
Name: count, dtype: int64
hand.label_simplified
other      0.794212
media      0.099678
acad       0.061093
media      0.022508
acad       0.009646
other      0.006431
gov        0.003215
tourbiz    0.003215
Name: proportion, dtype: float64
hand.label_simplified
other      207
media        9
acad         4
other        3
gov          3
media        2
tourbiz      1
gov          1
Name: count, dtype: int64
hand.label_simplified
other      0.900000
media      0.039130
acad       0.017391
other      0.013043
gov        0.013043
media      0.008696
tourbiz    0.004348
gov        0.004348
Name: proportion, dtype: float64
hand.label_simplified
other      2179
media       127
acad         39
other        31
acad         11
media         9
tourbiz       2
tourbiz       1
gov           1
gov           1
Name: count, dtype: int64
hand.label_simplified
othe

In [65]:
# tf-idf
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

In [67]:
# adding the account labels to the csv with tweets in order to accesss during tf-idf
oil_w_acc = pd.merge(oil, accounts_merged, on='username', how='left')
print(oil_w_acc)
industrial_w_acc = pd.merge(industrial, accounts_merged, on='username', how='left')
sewage_w_acc = pd.merge(sewage, accounts_merged, on='username', how='left')

     Unnamed: 0                                               text  \
0             1  Reposting @divyendhu:\nOil Spill on the Gulf o...   
1             2  RT @hihi0806: Reposting @divyendhu:\nOil Spill...   
2             3  RT @hihi0806: Reposting @divyendhu:\nOil Spill...   
3             4  RT @hihi0806: Reposting @divyendhu:\nOil Spill...   
4             5  RT @hihi0806: Reposting @divyendhu:\nOil Spill...   
..          ...                                                ...   
705         615  @1053SS I think this team shouldn’t even be ov...   
706           1  The governor is responsible, but those who cam...   
707           2  The governor is responsible, but those who cam...   
708           3  The governor is responsible, but those who cam...   
709           4  RT @DGRFlorida: The governor is responsible, b...   

     possibly_sensitive                   id            author_id  \
0                  True  1079362755352817665             95188073   
1                 Fal

In [382]:
# wordcloud analysis based on account type

#preprocessing
oil_w_acc['text_with_display_links'].fillna('', inplace=True)
industrial_w_acc['text_with_display_links'].fillna('', inplace=True)
sewage_w_acc['text_with_display_links'].fillna('', inplace=True)

In [77]:
# preprocessing for tf-idf: columns -> account type, word, n (number of accounts that mentioned that word)
oil_acad = oil_w_acc[oil_w_acc["hand.label_simplified"]=="acad"]
word_counts = oil_acad.groupby(['username', 'text_with_display_links']).size().reset_index(name='count')


In [79]:
word_counts['words'] = word_counts['text_with_display_links'].str.split()

# Exploding the words column to create rows for each word
word_counts = word_counts.explode('words')

# Drop duplicates of user and words to get unique user-word pairs
unique_user_word_pairs = word_counts[['username', 'words']].drop_duplicates()

# Count unique users per word
word_usage_count = unique_user_word_pairs.groupby('words')['username'].nunique().reset_index(name='count')

# Count how many unique words have been used by at least one user
unique_word_count = word_usage_count['words'].nunique()

In [81]:
unique_word_count

175

In [93]:
word_usage_count.sort_values(by='count', ascending=False)

Unnamed: 0,words,count
103,oil,22
151,spill,22
156,tampa,17
144,science,14
72,gulf,13
...,...,...
100,morning,1
44,ecology,1
46,ecosystems,1
48,ellen,1
