In [2]:
import pandas as pd

In [3]:
# Goal: Read our dataset and separate it based on labels.
# When the user gives a tweet, we compare each word in the tweet
# to the corresponding dataframe and see which words are in that dataframe.
# To make sure we are not including unimportant words, preprocessing is necessary.

In [11]:
# Load the preprocessed tweets, list the stance labels present, preview rows
dataset_df = pd.read_csv("../Dataset/Preprocessed_Data_Added_More.csv")
print(pd.unique(dataset_df["stance"]))
dataset_df.head()

['believer' 'denier' 'neutral']


Unnamed: 0,id,stance,Tweet
0,1.05e+18,believer,unenvironment never able tackle climate change...
1,1.07e+18,believer,oxfam many people need die drought induced hun...
2,9.92e+17,believer,pakistani city breaks april record day 50c hea...
3,1.13e+18,believer,meet christopher lee ceo climate kic aus speak...
4,1.12e+18,believer,mariansmedley would like see every article rep...


In [22]:
# One single-column ("Tweet") dataframe per stance label
believer_df = dataset_df.loc[dataset_df["stance"] == "believer", ["Tweet"]]
denier_df = dataset_df.loc[dataset_df["stance"] == "denier", ["Tweet"]]
neutral_df = dataset_df.loc[dataset_df["stance"] == "neutral", ["Tweet"]]

# Sanity check: the three parts together must account for every row
print(sum(map(len, (believer_df, denier_df, neutral_df))) == len(dataset_df))
believer_df.head()

True


Unnamed: 0,Tweet
0,unenvironment never able tackle climate change...
1,oxfam many people need die drought induced hun...
2,pakistani city breaks april record day 50c hea...
3,meet christopher lee ceo climate kic aus speak...
4,mariansmedley would like see every article rep...


In [32]:
# Source: https://towardsdatascience.com/check-for-a-substring-in-a-pandas-dataframe-column-4b949f64852
# str.contains builds a boolean mask over the tweets; a non-zero number of
# selected rows means the word occurs somewhere in the dataset.
for probe_word in ("unenvironment", "unenviXronment"):
    matching_rows = believer_df[believer_df["Tweet"].str.contains(probe_word, case=False)]
    print(len(matching_rows))

36
0


In [45]:
# The probe tweet is taken from the dataset itself, so every word must be found
test_string = believer_df.iloc[0, 0]
print(test_string)
keywords = [
    word
    for word in test_string.split(" ")
    if len(believer_df.loc[believer_df["Tweet"].str.contains(word, case=False)]) > 0
]
keywords

unenvironment never able tackle climate change without bringing climate culture india taki


['unenvironment',
 'never',
 'able',
 'tackle',
 'climate',
 'change',
 'without',
 'bringing',
 'climate',
 'culture',
 'india',
 'taki']

In [90]:
# Test String from the Live Demo CSV
test_string = """recovery economic demographic military debt inflation push humans nature wipeout climate change fallout natural 
              disasters wars resources migration amp blame bankruptcies closures russia china amp covid humans poverty humans alive 70yrs ago"""
# The literal spans two lines, so split on any run of whitespace: splitting on
# a single space yields '' and '\n' tokens, and '' "matches" every tweet, which
# is why a clean-up pass used to be needed afterwards.
keywords = []
for word in test_string.split():
    # regex=False: the word is literal text, not a regular expression.
    # na=False: rows with a missing tweet count as "no match".
    if believer_df["Tweet"].str.contains(word, case=False, regex=False, na=False).any():
        keywords.append(word)
keywords

['recovery',
 'economic',
 'demographic',
 'military',
 'debt',
 'inflation',
 'push',
 'humans',
 'nature',
 'wipeout',
 'climate',
 'change',
 'fallout',
 'natural',
 'disasters',
 'wars',
 'resources',
 'migration',
 'amp',
 'blame',
 'closures',
 'russia',
 'china',
 'amp',
 'humans',
 'poverty',
 'humans',
 'alive',
 'ago']

In [57]:
# Compare the character length of the joined keywords with the original
# string to gauge how much of the tweet was found in the dataset
joined_keywords = " ".join(keywords)
print(f"Keywords: {len(joined_keywords)}")
print(f"Original Word: {len(test_string)}")

Keywords: 214
Original Word: 254


In [98]:
# Compare each word of a tweet with a dataframe & return the words they share
def extract_Keywords(tweet, df):
    """Return the words of ``tweet`` that occur in at least one tweet of ``df``.

    Matching is a case-insensitive *substring* search against ``df['Tweet']``,
    so a word also counts as found when it appears inside a longer word.

    Parameters
    ----------
    tweet : str
        Text to scan; it is split on any run of whitespace.
    df : pandas.DataFrame
        Frame with a ``Tweet`` column of strings. Missing values never match.

    Returns
    -------
    list of str
        The matching words in their original order; repeated words are kept.
    """
    tweets = df['Tweet']
    found = {}  # word -> bool, so a repeated word scans the column only once
    keywords = []
    # split() without an argument drops the empty / newline tokens that
    # split(" ") produces for multi-line or double-spaced text.
    for word in tweet.split():
        if word not in found:
            # regex=False: words are literal text ("c++" is not a pattern).
            # na=False: rows with a missing tweet count as "no match".
            found[word] = bool(
                tweets.str.contains(word, case=False, regex=False, na=False).any()
            )
        if found[word]:
            keywords.append(word)
    return keywords

# Route a tweet to extract_Keywords with the dataframe of its stance
def getKeywords_fucntion(tweet, stanceType):
    """Return the keywords of ``tweet`` found in the dataframe for its stance.

    ``stanceType`` may be the label or its numeric code:
    "believer"/2, "denier"/1, "neutral"/0. The matching module-level
    dataframe (believer_df, denier_df or neutral_df) is passed to
    extract_Keywords. Any other value yields an empty list.
    """
    if stanceType in ("believer", 2):
        return extract_Keywords(tweet, believer_df)
    if stanceType in ("denier", 1):
        return extract_Keywords(tweet, denier_df)
    if stanceType in ("neutral", 0):
        return extract_Keywords(tweet, neutral_df)
    # Unrecognised stance: nothing to compare against
    return []

In [83]:
# Batch example: attach a keyword list to every tweet of a CSV
# Note: output.csv is the file downloaded from server.py
def _keywords_for_row(row):
    """Look up the keywords of one row using its own stance label."""
    return getKeywords_fucntion(row['Tweet'], row['stance'])

test_df = pd.read_csv("../LiveDemoDataset/output.csv")
test_df["Keywords"] = test_df.apply(_keywords_for_row, axis=1)
test_df


Unnamed: 0.1,Unnamed: 0,ID,Date,Tweet,stance,Keywords
0,0,1.64332e+18,2023-04-04 18:27:48+00:00,recovery economic demographic military debt in...,believer,"[recovery, economic, demographic, military, de..."
1,1,1.64332e+18,2023-04-04 18:27:40+00:00,proudelephantus hellonheels2020 beach mansion ...,neutral,"[beach, mansion, climate, change, hypocrites, ..."
2,2,1.64332e+18,2023-04-04 18:27:36+00:00,recovery economic demographic military debt in...,believer,"[recovery, economic, demographic, military, de..."
3,3,1.64332e+18,2023-04-04 18:27:31+00:00,evelyn von warnitz added logline titled amazin...,denier,"[evelyn, von, added, titled, amazing, life, co..."
4,4,1.64332e+18,2023-04-04 18:27:22+00:00,royalaviaire reversed climate change,denier,"[reversed, climate, change]"


In [97]:
# Single-tweet example: stance code 2 selects the believer dataframe
test_string = """recovery economic demographic military debt inflation push humans nature wipeout climate change fallout natural 
              disasters wars resources migration amp blame bankruptcies closures russia china amp covid humans poverty humans alive 70yrs ago"""
getKeywords_fucntion(tweet=test_string, stanceType=2)

recovery economic demographic military debt inflation push humans nature wipeout climate change fallout natural 
              disasters wars resources migration amp blame bankruptcies closures russia china amp covid humans poverty humans alive 70yrs ago


['recovery',
 'economic',
 'demographic',
 'military',
 'debt',
 'inflation',
 'push',
 'humans',
 'nature',
 'wipeout',
 'climate',
 'change',
 'fallout',
 'natural',
 'disasters',
 'wars',
 'resources',
 'migration',
 'amp',
 'blame',
 'closures',
 'russia',
 'china',
 'amp',
 'humans',
 'poverty',
 'humans',
 'alive',
 'ago']