In [61]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_predict, train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.preprocessing import StandardScaler

from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize, pos_tag
from collections import defaultdict
import re
import json
from sklearn.metrics import confusion_matrix
import pickle

tag_map = defaultdict(lambda: wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV
tag_map['AS'] = wn.ADJ_SAT

# filepath = "finalized_8K_accounts.csv"
# filepath = "UNLABELED_accounts_emojis_replaced.csv"
filepath = "Spill_Accounts_To_Be_Labeled.csv"
hand_label = "hand.label"
government = "gov"
academia = "acad"
tourBiz = "tourbiz"

df = pd.read_csv(filepath)

# df = df[((df[hand_label] == 'media') | (df[hand_label] == tourBiz) |(df[hand_label] == academia) | (df[hand_label] == government) | (
#        df[hand_label] == 'other'))]

df = df[['username', 'description']]  # keep only relevant columns

lemmatizer = WordNetLemmatizer()
words_not_changed = ['media']


def preprocessing(row):
    if str(row) == "nan":
        lemma = ""
    else:
        row = str(row).lower()
        row = word_tokenize(row)  # tokenize
        lemma = [lemmatizer.lemmatize(token, tag_map[tag[0]]) if token not in words_not_changed else token for
                 token, tag in pos_tag(row)]  # lemmatization, depending on part-of-speech
        lemma = ["" if re.search(r'\b[0-9]+\b\s*', lem) else lem for lem in lemma]  # removing
    return str(lemma)


df['description_lemmatized'] = df['description'].apply(preprocessing)

print(df)


             username                                        description  \
0       ChungSunPark4                                      attempt me!!!   
1        LucilaQuanti  Me gusta la gente con sentido del humor, alegr...   
2       patdefranchis  I love people with a large dose of humor & a r...   
3       saravastiares  Have courage & Be Kind. Where there is kindnes...   
4      TheShogunGamer  Video Game Extraordinaire 🎮 Polyamorous ❤ Disa...   
...               ...                                                ...   
3493   ChrisFischer07                                                NaN   
3494       ryantpa813  Eight One Three Sports\nBolts, Bucs, Rays, Soo...   
3495  TueNiteRockStar                                                NaN   
3496     purpletang99  i tweet my opinions only. yes there are except...   
3497       firefly909  Unapologetic bleeding heart liberal. Hate liar...   

                                 description_lemmatized  
0                      ['atte

In [62]:
# all the empty descriptions
print(df.shape)
print(df[df['description_lemmatized'] != ""].shape) 

(3498, 3)
(2908, 3)


In [63]:
# Remove all the empty descriptions
empty_rows = df[df['description_lemmatized'] == ""]
print(empty_rows.shape)
df = df[df['description_lemmatized'] != ""]
print(df.shape)
#df[hand_label]
#print(df.shape)
#df[df['description_lemmatized'] != ""].shape

(590, 3)
(2908, 3)


In [34]:
# Re-indexing the remaining observations
df = df.reset_index(drop=True)

In [1]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-MiniLM-L6-v2')

# print(type(df[['description_lemmatized']]))
embeddings = model.encode(df['description'].tolist())

ImportError: cannot import name 'Tensor' from 'torch' (unknown location)

In [43]:
# filename = 'SVM_BOW_unweighted_enhanced_model.pickle'
filename = 'SVM_BERT_unweighted_enhanced_model_full(1, 2).pickle'
loaded_model = pickle.load(open(filename, 'rb'))

X_test = embeddings

bag_of_words_y_pred_test = loaded_model.predict(X_test)

bag_of_words_y_pred_test

pred_prob = loaded_model.predict_proba(X_test)



In [44]:
pred_prob
bag_of_words_y_pred_test
pd.concat([pd.DataFrame(bag_of_words_y_pred_test), pd.DataFrame(pred_prob)], axis=1)

Unnamed: 0,0,0.1,1,2,3,4
0,other,0.000585,0.001464,0.001155,0.992605,0.004191
1,other,0.004342,0.000639,0.003219,0.982458,0.009342
2,other,0.005846,0.015085,0.049567,0.928598,0.000904
3,other,0.001891,0.001057,0.002662,0.994019,0.000370
4,other,0.006474,0.001169,0.005799,0.983785,0.002772
...,...,...,...,...,...,...
13237,other,0.001823,0.010809,0.064314,0.785241,0.137813
13238,other,0.007914,0.001513,0.048221,0.941749,0.000602
13239,other,0.492738,0.001781,0.002790,0.502624,0.000067
13240,other,0.012996,0.000962,0.000605,0.984303,0.001134


In [45]:
pred_prob.shape

(13242, 5)

In [46]:
pred_prob_df = pd.DataFrame(pred_prob, columns = ['acad_prob','gov_prob','media_prob','other_prob', 'tourbiz_prob'])

bag_of_words_y_pred_test.size

df['hand.label_simplified'] = bag_of_words_y_pred_test
#df = df.drop(columns=['description_lemmatized'])
df1 = pd.concat([df, pred_prob_df], axis=1)
#df1 = pd.DataFrame(my_array, columns = ['acad_prob','gov_prob','media_prob','other_prob', 'tourbiz_prob'])

df1
df1.shape
#pred_prob_df.shape
#len(bag_of_words_y_pred_test)

(13242, 9)

In [47]:
df1

Unnamed: 0,username,description,description_lemmatized,hand.label_simplified,acad_prob,gov_prob,media_prob,other_prob,tourbiz_prob
0,LeChatNoire4,#VOTE BLUE 2022 🌊🇺🇸🌊 #BuyARepublicanToday! no ...,"['#', 'vote', 'blue', '', '🌊🇺🇸🌊', '#', 'buyare...",other,0.000585,0.001464,0.001155,0.992605,0.004191
1,SethPlatt,Creator Collector Cultivator Art Web3 ENS AI S...,"['creator', 'collector', 'cultivator', 'art', ...",other,0.004342,0.000639,0.003219,0.982458,0.009342
2,eco_voice,"A non-partisan, independent, volunteer run org...","['a', 'non-partisan', ',', 'independent', ',',...",other,0.005846,0.015085,0.049567,0.928598,0.000904
3,Corn4Harvick,*Flo-Grown* 🇺🇸 🇺🇸 Jesus sent me back to straig...,"['*', 'flo-grown', '*', '🇺🇸', '🇺🇸', 'jesus', '...",other,0.001891,0.001057,0.002662,0.994019,0.000370
4,memorabiliaddy,Healthcare Professional * Dad to Two * MSU Alu...,"['healthcare', 'professional', '*', 'dad', 'to...",other,0.006474,0.001169,0.005799,0.983785,0.002772
...,...,...,...,...,...,...,...,...,...
13237,EvergreenZephyr,"Wichita, Kansas, United (sic) States. Parody a...","['wichita', ',', 'kansa', ',', 'united', '(', ...",other,0.001823,0.010809,0.064314,0.785241,0.137813
13238,johntfox,Madeleine & Marin's Dad | Gin Enthusiast | Twe...,"['madeleine', '&', 'marin', ""'s"", 'dad', '|', ...",other,0.007914,0.001513,0.048221,0.941749,0.000602
13239,SeGreene,Cranky former nurse and current plant patholog...,"['cranky', 'former', 'nurse', 'and', 'current'...",other,0.492738,0.001781,0.002790,0.502624,0.000067
13240,CherylLasse,"Passionate about the environment, science and ...","['passionate', 'about', 'the', 'environment', ...",other,0.012996,0.000962,0.000605,0.984303,0.001134


In [48]:
df1.to_csv(r'SVM_BERT_unweighted_UNLABELED_PREDICTED_accounts_W_PROBABILITIES_emojis_unchanged.csv', index=False)

In [67]:
import pandas as pd
spill_labeled_accs = pd.read_csv("Spill_Labeled.csv")
#spill_labeled_accs

In [69]:
print(spill_labeled_accs.size)

print(spill_labeled_accs["hand.label_simplified"].value_counts())

spill_labeled_accs["hand.label_simplified"].value_counts("row")

26172
hand.label_simplified
other      2680
media       162
acad         62
tourbiz       3
gov           1
Name: count, dtype: int64


hand.label_simplified
other      0.921596
media      0.055708
acad       0.021320
tourbiz    0.001032
gov        0.000344
Name: proportion, dtype: float64

In [71]:
# getting the rows labeled as other
mask = spill_labeled_accs["hand.label_simplified"] == 'other'


row_list = []
# Check if any of the probabilities (except 'other_prob') are greater than 0.3
for index, row in spill_labeled_accs[mask].iterrows():
    for col in ['acad_prob', 'gov_prob', 'media_prob', 'other_prob', 'tourbiz_prob']:
        if col != 'other_prob' and row[col] > 0.3:
            # saving the columns to review
            row_list.append(row)
            # Update the prediction column to the column name where the probability is higher than 0.3
            #spill_labeled_accs.at[index, "hand.label_simplified"] = col.replace('_prob', '')
rows_for_review = pd.DataFrame(row_list)
rows_for_review = rows_for_review.drop_duplicates()
#print(rows_for_review)

rows_for_review.to_csv("Accounts_To_Relabel.csv")

In [73]:
print(spill_labeled_accs["hand.label_simplified"].value_counts())

spill_labeled_accs["hand.label_simplified"].value_counts("row")

hand.label_simplified
other      2680
media       162
acad         62
tourbiz       3
gov           1
Name: count, dtype: int64


hand.label_simplified
other      0.921596
media      0.055708
acad       0.021320
tourbiz    0.001032
gov        0.000344
Name: proportion, dtype: float64

In [75]:
print(spill_labeled_accs)

            username                                        description  \
0      ChungSunPark4                                      attempt me!!!   
1       LucilaQuanti  Me gusta la gente con sentido del humor, alegr...   
2      patdefranchis  I love people with a large dose of humor & a r...   
3      saravastiares  Have courage & Be Kind. Where there is kindnes...   
4     TheShogunGamer  Video Game Extraordinaire 🎮 Polyamorous ❤ Disa...   
...              ...                                                ...   
2903       meevans59  Retired Department of Defense civilian employe...   
2904          dvdhnz  Techie by trade, recalcitrant by design, socia...   
2905      ryantpa813  Eight One Three Sports\nBolts, Bucs, Rays, Soo...   
2906    purpletang99  i tweet my opinions only. yes there are except...   
2907      firefly909  Unapologetic bleeding heart liberal. Hate liar...   

                                 description_lemmatized hand.label_simplified  \
0                 

In [85]:
spill_accs_merged = pd.merge(spill_labeled_accs, empty_rows, how="outer")

print(spill_accs_merged.shape)
print(spill_accs_merged)

(3498, 9)
             username                                        description  \
0             004nino  Retraité.Égalité,fraternité,justice,liberté,na...   
1              00a03d  Mom of one, grandma of two, sister of six, aun...   
2     05adamlover0129  LOVE Adam Lambert for life!!! Also love Miami ...   
3     0Plongstocking2  Not a traitor. Traveler. Liker of food. Orchid...   
4             0Thessa  💚... schönheit\nist die natur! die kunst ist u...   
...               ...                                                ...   
3493   zoeycarmicheal  The Univ. of Alabama Alumni. Former Gymnast an...   
3494           zpleat  Research at Media Matters for America. All twe...   
3495         zshahan3  Human (maybe), writer + chief editor + CEO @Cl...   
3496          zul1732  You can't see people for what they are when yo...   
3497        zyiteblog  E-Scootersworld is global Power transport reta...   

                                 description_lemmatized hand.label_simplified

In [91]:
# acccounts labeled during Red-Tide research
prev_accs = pd.read_csv("Final_Account_Labels_for_Dashboard.csv")
print(prev_accs)
accounts_merged = pd.concat([spill_accs_merged, prev_accs], ignore_index=True)

accounts_merged.to_csv("ALL_Labeled_Accounts_Spill&RedTide.csv")

              username                                        description  \
0                  CNN  It’s our job to #GoThere & tell the most diffi...   
1               NatGeo  Taking our understanding and awareness of the ...   
2              FoxNews  Follow America's #1 cable news network, delive...   
3       washingtonpost                         Democracy Dies in Darkness   
4                  ABC  The only official ABC News Twitter account. Do...   
...                ...                                                ...   
29128  EvergreenZephyr  Wichita, Kansas, United (sic) States. Parody a...   
29129         johntfox  Madeleine & Marin's Dad | Gin Enthusiast | Twe...   
29130         SeGreene  Cranky former nurse and current plant patholog...   
29131      CherylLasse  Passionate about the environment, science and ...   
29132          jen_pic  🚫socialism. Pay your debts, ALL OF THEM! Nothi...   

       Label Label.Type  
0      media       Hand  
1      media       Hand

In [117]:
oil = pd.read_csv("../spill_data/All_OilSpill.csv")
sewage = pd.read_csv("../spill_data/All_SewageSpill.csv")
industrial = pd.read_csv("../spill_data/All_IndustrialSpill.csv")

oil_accs = accounts_merged[accounts_merged["username"].isin(oil["username"])]
print(oil_accs["hand.label_simplified"].value_counts())
print(oil_accs["hand.label_simplified"].value_counts("row"))

sewage_accs = accounts_merged[accounts_merged["username"].isin(sewage["username"])]
print(sewage_accs["hand.label_simplified"].value_counts())
print(sewage_accs["hand.label_simplified"].value_counts("row"))

industrial_accs = accounts_merged[accounts_merged["username"].isin(industrial["username"])]
print(industrial_accs["hand.label_simplified"].value_counts())
print(industrial_accs["hand.label_simplified"].value_counts("row"))

hand.label_simplified
other    261
media     31
acad      19
Name: count, dtype: int64
hand.label_simplified
other    0.839228
media    0.099678
acad     0.061093
Name: proportion, dtype: float64
hand.label_simplified
other      215
media        9
acad         4
tourbiz      1
gov          1
Name: count, dtype: int64
hand.label_simplified
other      0.934783
media      0.039130
acad       0.017391
tourbiz    0.004348
gov        0.004348
Name: proportion, dtype: float64
hand.label_simplified
other      2232
media       127
acad         39
tourbiz       2
gov           1
Name: count, dtype: int64
hand.label_simplified
other      0.929613
media      0.052895
acad       0.016243
tourbiz    0.000833
gov        0.000416
Name: proportion, dtype: float64
