# Import Required Libraries

In [39]:
import pandas as pd
import spacy
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from spacy.lang.en.stop_words import STOP_WORDS
import string

In [24]:
# Load spaCy's small English pipeline; extract_important_words uses it for part-of-speech tags.
nlp = spacy.load("en_core_web_sm")

In [25]:
# Single VADER analyzer instance, reused by calculate_sentiment for every row.
analyzer = SentimentIntensityAnalyzer()

# Load Dataset

In [26]:
# Load the True.csv file into a pandas DataFrame
df = pd.read_csv('True.csv')
df.head()

Unnamed: 0,title,text,subject,date
0,Cambodian PM pulls back on threat to shut righ...,PHNOM PENH (Reuters) - Cambodian Prime Ministe...,worldnews,"December 2, 2017"
1,UK court agrees to extradite suspect in Italy ...,LONDON (Reuters) - A man accused of being part...,worldnews,"September 29, 2017"
2,U.S. asks Venezuela for access to detained Cit...,CARACAS (Reuters) - Washington has asked the g...,worldnews,"November 23, 2017"
3,U.S. to unveil revised self-driving car guidel...,WASHINGTON (Reuters) - President Donald Trump’...,politicsNews,"September 5, 2017"
4,Court allows Democratic states to defend Obama...,WASHINGTON (Reuters) - A U.S. appeals court on...,politicsNews,"August 1, 2017"


# Preprocess Dataset

In [27]:
# Parse the 'date' strings once, then store both the parsed dates and the
# calendar year derived from them ('Year') back on the frame
parsed_dates = pd.to_datetime(df['date'])
df['date'] = parsed_dates
df['Year'] = parsed_dates.dt.year
df.head()

Unnamed: 0,title,text,subject,date,Year
0,Cambodian PM pulls back on threat to shut righ...,PHNOM PENH (Reuters) - Cambodian Prime Ministe...,worldnews,2017-12-02,2017
1,UK court agrees to extradite suspect in Italy ...,LONDON (Reuters) - A man accused of being part...,worldnews,2017-09-29,2017
2,U.S. asks Venezuela for access to detained Cit...,CARACAS (Reuters) - Washington has asked the g...,worldnews,2017-11-23,2017
3,U.S. to unveil revised self-driving car guidel...,WASHINGTON (Reuters) - President Donald Trump’...,politicsNews,2017-09-05,2017
4,Court allows Democratic states to defend Obama...,WASHINGTON (Reuters) - A U.S. appeals court on...,politicsNews,2017-08-01,2017


In [28]:
# Count how many rows carry each distinct value of the 'subject' column and print the tally
subject_counts = df['subject'].value_counts()
print(subject_counts)

subject
politicsNews    5307
worldnews       4693
Name: count, dtype: int64


In [29]:
# Function to extract important words from text
def extract_important_words(text):
    """Return the nouns, verbs and adjectives of ``text`` as one ';'-joined string.

    The text is processed with the notebook-level spaCy pipeline ``nlp``.
    Tokens whose ``pos_`` tag is NOUN, VERB or ADJ are kept in document
    order, using their original surface text.
    """
    kept_tags = ['NOUN', 'VERB', 'ADJ']
    selected = []
    for token in nlp(text):
        if token.pos_ in kept_tags:
            selected.append(token.text)
    return ';'.join(selected)

# Apply the function to the 'text' column
# (every row's text is run through the spaCy pipeline inside extract_important_words)
df['important_words'] = df['text'].apply(extract_important_words)
df.head()

Unnamed: 0,title,text,subject,date,Year,important_words
0,Cambodian PM pulls back on threat to shut righ...,PHNOM PENH (Reuters) - Cambodian Prime Ministe...,worldnews,2017-12-02,2017,Cambodian;pulled;threat;close;human;rights;gro...
1,UK court agrees to extradite suspect in Italy ...,LONDON (Reuters) - A man accused of being part...,worldnews,2017-09-29,2017,man;accused;part;plot;kidnap;British;model;ext...
2,U.S. asks Venezuela for access to detained Cit...,CARACAS (Reuters) - Washington has asked the g...,worldnews,2017-11-23,2017,asked;government;leftist;access;Venezuelan;Ame...
3,U.S. to unveil revised self-driving car guidel...,WASHINGTON (Reuters) - President Donald Trump’...,politicsNews,2017-09-05,2017,administration;set;unveil;revised;self;driving...
4,Court allows Democratic states to defend Obama...,WASHINGTON (Reuters) - A U.S. appeals court on...,politicsNews,2017-08-01,2017,appeals;court;allowed;Democratic;state;attorne...


In [30]:
# Function to calculate normalized sentiment score
def calculate_sentiment(text):
    """Score ``text`` with the shared VADER ``analyzer`` and rescale the result.

    Only the ``compound`` entry of the polarity scores is used; it is shifted
    by one and halved so that the returned value falls in the range [0, 1].
    """
    compound = analyzer.polarity_scores(text)['compound']
    return (compound + 1) / 2

# Score each row's 'text' and store the normalized value in 'sentiment_score'
df['sentiment_score'] = df['text'].apply(calculate_sentiment)
df.head()

Unnamed: 0,title,text,subject,date,Year,important_words,sentiment_score
0,Cambodian PM pulls back on threat to shut righ...,PHNOM PENH (Reuters) - Cambodian Prime Ministe...,worldnews,2017-12-02,2017,Cambodian;pulled;threat;close;human;rights;gro...,0.9904
1,UK court agrees to extradite suspect in Italy ...,LONDON (Reuters) - A man accused of being part...,worldnews,2017-09-29,2017,man;accused;part;plot;kidnap;British;model;ext...,0.063
2,U.S. asks Venezuela for access to detained Cit...,CARACAS (Reuters) - Washington has asked the g...,worldnews,2017-11-23,2017,asked;government;leftist;access;Venezuelan;Ame...,0.00585
3,U.S. to unveil revised self-driving car guidel...,WASHINGTON (Reuters) - President Donald Trump’...,politicsNews,2017-09-05,2017,administration;set;unveil;revised;self;driving...,0.9261
4,Court allows Democratic states to defend Obama...,WASHINGTON (Reuters) - A U.S. appeals court on...,politicsNews,2017-08-01,2017,appeals;court;allowed;Democratic;state;attorne...,0.0876


In [31]:
# Function to classify sentiment
def classify_sentiment(score, negative_below=0.4, positive_above=0.6):
    """Map a normalized sentiment score to 'Negative', 'Neutral' or 'Positive'.

    Args:
        score: Normalized sentiment score (this notebook produces values in [0, 1]).
        negative_below: Scores strictly below this bound are 'Negative'.
        positive_above: Scores strictly above this bound are 'Positive'.

    Returns:
        'Negative' when ``score < negative_below``, 'Neutral' when
        ``negative_below <= score <= positive_above``, otherwise 'Positive'.
        A NaN score fails both comparisons and therefore yields 'Positive'.
    """
    if score < negative_below:
        return 'Negative'
    # The lower bound was ruled out above, so only the upper bound is checked here.
    if score <= positive_above:
        return 'Neutral'
    return 'Positive'

# Label each row from its 'sentiment_score' and store the label in 'sentiment'
df['sentiment'] = df['sentiment_score'].apply(classify_sentiment)
df.head()

Unnamed: 0,title,text,subject,date,Year,important_words,sentiment_score,sentiment
0,Cambodian PM pulls back on threat to shut righ...,PHNOM PENH (Reuters) - Cambodian Prime Ministe...,worldnews,2017-12-02,2017,Cambodian;pulled;threat;close;human;rights;gro...,0.9904,Positive
1,UK court agrees to extradite suspect in Italy ...,LONDON (Reuters) - A man accused of being part...,worldnews,2017-09-29,2017,man;accused;part;plot;kidnap;British;model;ext...,0.063,Negative
2,U.S. asks Venezuela for access to detained Cit...,CARACAS (Reuters) - Washington has asked the g...,worldnews,2017-11-23,2017,asked;government;leftist;access;Venezuelan;Ame...,0.00585,Negative
3,U.S. to unveil revised self-driving car guidel...,WASHINGTON (Reuters) - President Donald Trump’...,politicsNews,2017-09-05,2017,administration;set;unveil;revised;self;driving...,0.9261,Positive
4,Court allows Democratic states to defend Obama...,WASHINGTON (Reuters) - A U.S. appeals court on...,politicsNews,2017-08-01,2017,appeals;court;allowed;Democratic;state;attorne...,0.0876,Negative


In [32]:
# Drop the columns that the per-word aggregation below does not use
# NOTE(review): this rebinds `df` without 'text', 'date', 'subject' and the 'sentiment' label
# computed in the previous cell. A later cell drops ['text', 'date', 'sentiment_score'] and
# relies on 'subject'/'sentiment'; it cannot run after this one on a fresh kernel.
df = df.drop(columns=['text', 'date', 'subject', 'sentiment'])
df.head()

Unnamed: 0,title,Year,important_words,sentiment_score
0,Cambodian PM pulls back on threat to shut righ...,2017,Cambodian;pulled;threat;close;human;rights;gro...,0.9904
1,UK court agrees to extradite suspect in Italy ...,2017,man;accused;part;plot;kidnap;British;model;ext...,0.063
2,U.S. asks Venezuela for access to detained Cit...,2017,asked;government;leftist;access;Venezuelan;Ame...,0.00585
3,U.S. to unveil revised self-driving car guidel...,2017,administration;set;unveil;revised;self;driving...,0.9261
4,Court allows Democratic states to defend Obama...,2017,appeals;court;allowed;Democratic;state;attorne...,0.0876


In [33]:
# Bind a second name to the current frame. This is the same object, not a copy.
df1 = df

In [34]:
# Break each ';'-separated 'important_words' string into one column per word
important_words_split = df['important_words'].str.split(';', expand=True)

# Label the new columns word1, word2, ..., wordn
n_word_cols = important_words_split.shape[1]
important_words_split.columns = ['word' + str(pos) for pos in range(1, n_word_cols + 1)]

# Swap the 'important_words' column for the per-word columns, side by side
df_without_words = df.drop(columns=['important_words'])
df = pd.concat([df_without_words, important_words_split], axis=1)
df.head()

Unnamed: 0,title,Year,sentiment_score,word1,word2,word3,word4,word5,word6,word7,...,word1312,word1313,word1314,word1315,word1316,word1317,word1318,word1319,word1320,word1321
0,Cambodian PM pulls back on threat to shut righ...,2017,0.9904,Cambodian,pulled,threat,close,human,rights,group,...,,,,,,,,,,
1,UK court agrees to extradite suspect in Italy ...,2017,0.063,man,accused,part,plot,kidnap,British,model,...,,,,,,,,,,
2,U.S. asks Venezuela for access to detained Cit...,2017,0.00585,asked,government,leftist,access,Venezuelan,American,executives,...,,,,,,,,,,
3,U.S. to unveil revised self-driving car guidel...,2017,0.9261,administration,set,unveil,revised,self,driving,vehicle,...,,,,,,,,,,
4,Court allows Democratic states to defend Obama...,2017,0.0876,appeals,court,allowed,Democratic,state,attorneys,general,...,,,,,,,,,,


In [None]:
# Reshape to long format: one row per (original row, word slot)
word_cols = [name for name in df.columns if name.startswith('word')]
id_cols = [name for name in df.columns if not name.startswith('word')]
df_melted = pd.melt(df, id_vars=id_cols, value_vars=word_cols,
                    var_name='word_number', value_name='word')

# Word slots that a row never filled are missing; discard those rows
df_melted = df_melted.dropna(subset=['word'])

In [36]:
# Preview the long-format frame (one word per row)
df_melted.head()

Unnamed: 0,title,Year,sentiment_score,word_number,word
0,Cambodian PM pulls back on threat to shut righ...,2017,0.9904,word1,Cambodian
1,UK court agrees to extradite suspect in Italy ...,2017,0.063,word1,man
2,U.S. asks Venezuela for access to detained Cit...,2017,0.00585,word1,asked
3,U.S. to unveil revised self-driving car guidel...,2017,0.9261,word1,administration
4,Court allows Democratic states to defend Obama...,2017,0.0876,word1,appeals


In [37]:
# The slot label is not used by the aggregation below, so drop it.
# Re-running this cell alone raises KeyError because `df_melted` is rebound without the column.
df_melted = df_melted.drop(columns=['word_number'])

In [38]:
# For every (Year, word) pair compute the mean sentiment score and the number of occurrences
word_groups = df_melted.groupby(['Year', 'word'])
aggregated_df = word_groups.agg(
    average_sentiment=pd.NamedAgg(column='sentiment_score', aggfunc='mean'),
    frequency=pd.NamedAgg(column='word', aggfunc='size'),
).reset_index()
aggregated_df.head()

Unnamed: 0,Year,word,average_sentiment,frequency
0,2016,"""RECOMMENDED",0.03005,1
1,2016,#,0.990925,2
2,2016,%,0.9945,1
3,2016,*,0.956217,3
4,2016,+,0.9945,1


In [None]:
# Bind a second name to the unfiltered aggregate. This is the same object, not a copy.
aggregated_df1 = aggregated_df

In [40]:
# Function to check that a word is neither a stopword nor a symbol
def is_valid_word(word):
    """Return True when ``word`` should be kept, False when it should be filtered out.

    A word is rejected when it is empty or whitespace-only, when every
    character is ASCII punctuation (so multi-character symbols such as
    '--' or '...' are rejected too), or when its lower-cased form is in
    spaCy's ``STOP_WORDS`` (so capitalised stop words such as 'The' are
    rejected as well).
    """
    token = word.strip()
    if not token:
        return False
    # Check character by character: `token in string.punctuation` would be a
    # substring test and would let through symbols like '--'.
    if all(char in string.punctuation for char in token):
        return False
    # Compare case-insensitively so capitalised stop words are caught.
    return token.lower() not in STOP_WORDS

# Keep only the rows whose word passes is_valid_word, rename the columns for export,
# and tag every row with the constant source label 'true'
export_column_names = {
    'Year': 'year',
    'word': 'text',
    'average_sentiment': 'sentiment'
}
valid_word_mask = aggregated_df['word'].apply(is_valid_word)
aggregated_df = (
    aggregated_df[valid_word_mask]
    .rename(columns=export_column_names)
    .assign(source='true')
)
aggregated_df.head()

Unnamed: 0,year,text,sentiment,frequency,source
0,2016,"""RECOMMENDED",0.03005,1,True
6,2016,-Elimination,0.9949,1,True
7,2016,-H.,0.99875,1,True
8,2016,-annual,0.9949,1,True
9,2016,-government,0.6652,1,True


In [41]:
# Export the filtered per-year word table to a tab-separated file, without the index column
aggregated_df.to_csv('true_sentiment.tsv', sep='\t', index=False)

In [None]:
###

In [None]:
###

In [None]:
###

In [12]:
# Drop the columns 'text', 'date', and 'sentiment_score'
# NOTE(review): in top-to-bottom order `df` no longer has 'text' or 'date' at this point
# (an earlier cell dropped them together with 'subject' and 'sentiment'), so this drop
# raises KeyError on a fresh Restart & Run All. The output shown presumably comes from an
# earlier kernel state. Confirm, then rebuild this branch from a frame that still has them.
df = df.drop(columns=['text', 'date', 'sentiment_score'])
df.head()

Unnamed: 0,title,subject,Year,important_words,sentiment
0,Cambodian PM pulls back on threat to shut righ...,worldnews,2017,Cambodian;pulled;threat;close;human;rights;gro...,Positive
1,UK court agrees to extradite suspect in Italy ...,worldnews,2017,man;accused;part;plot;kidnap;British;model;ext...,Negative
2,U.S. asks Venezuela for access to detained Cit...,worldnews,2017,asked;government;leftist;access;Venezuelan;Ame...,Negative
3,U.S. to unveil revised self-driving car guidel...,politicsNews,2017,administration;set;unveil;revised;self;driving...,Positive
4,Court allows Democratic states to defend Obama...,politicsNews,2017,appeals;court;allowed;Democratic;state;attorne...,Negative


In [15]:
# Expand 'important_words' on ';' so that every word gets its own column
important_words_split = df['important_words'].str.split(';', expand=True)

# Name the expanded columns word1, word2, ..., wordn
word_column_labels = []
for position in range(important_words_split.shape[1]):
    word_column_labels.append(f'word{position + 1}')
important_words_split.columns = word_column_labels

# Put the word columns next to the remaining columns, leaving out 'important_words'
remaining_columns = df.drop(columns=['important_words'])
df = pd.concat([remaining_columns, important_words_split], axis=1)

# Show the first rows as a check
df.head()

Unnamed: 0,title,subject,Year,sentiment,word1,word2,word3,word4,word5,word6,...,word1312,word1313,word1314,word1315,word1316,word1317,word1318,word1319,word1320,word1321
0,Cambodian PM pulls back on threat to shut righ...,worldnews,2017,Positive,Cambodian,pulled,threat,close,human,rights,...,,,,,,,,,,
1,UK court agrees to extradite suspect in Italy ...,worldnews,2017,Negative,man,accused,part,plot,kidnap,British,...,,,,,,,,,,
2,U.S. asks Venezuela for access to detained Cit...,worldnews,2017,Negative,asked,government,leftist,access,Venezuelan,American,...,,,,,,,,,,
3,U.S. to unveil revised self-driving car guidel...,politicsNews,2017,Positive,administration,set,unveil,revised,self,driving,...,,,,,,,,,,
4,Court allows Democratic states to defend Obama...,politicsNews,2017,Negative,appeals,court,allowed,Democratic,state,attorneys,...,,,,,,,,,,


In [16]:
# Reshape to long format: one row per (article, word slot), keeping the identifying columns
word_columns = [name for name in df.columns if name.startswith('word')]
melted_df = df.melt(
    id_vars=['title', 'subject', 'Year', 'sentiment'],
    value_vars=word_columns,
    var_name='word_number',
    value_name='word',
)

# Discard the rows for word slots that are missing
melted_df = melted_df.dropna(subset=['word'])

# Show the first rows as a check
melted_df.head()

Unnamed: 0,title,subject,Year,sentiment,word_number,word
0,Cambodian PM pulls back on threat to shut righ...,worldnews,2017,Positive,word1,Cambodian
1,UK court agrees to extradite suspect in Italy ...,worldnews,2017,Negative,word1,man
2,U.S. asks Venezuela for access to detained Cit...,worldnews,2017,Negative,word1,asked
3,U.S. to unveil revised self-driving car guidel...,politicsNews,2017,Positive,word1,administration
4,Court allows Democratic states to defend Obama...,politicsNews,2017,Negative,word1,appeals


In [17]:
# Relabel the columns for the keyword export (title -> Title, subject -> Conference,
# word -> Author Names), then discard the 'sentiment' and 'word_number' columns
keyword_column_names = {
    'title': 'Title',
    'subject': 'Conference',
    'word': 'Author Names'
}
melted_df1 = (
    melted_df
    .rename(columns=keyword_column_names)
    .drop(columns=['sentiment', 'word_number'])
)
melted_df1.head()

Unnamed: 0,Title,Conference,Year,Author Names
0,Cambodian PM pulls back on threat to shut righ...,worldnews,2017,Cambodian
1,UK court agrees to extradite suspect in Italy ...,worldnews,2017,man
2,U.S. asks Venezuela for access to detained Cit...,worldnews,2017,asked
3,U.S. to unveil revised self-driving car guidel...,politicsNews,2017,administration
4,Court allows Democratic states to defend Obama...,politicsNews,2017,appeals


In [21]:
# Export the keyword table to a tab-separated file, without the index column
melted_df1.to_csv('true_Keywords.tsv', sep='\t', index=False)

In [19]:
# Relabel the columns for the sentiment export (title -> Title, sentiment -> Conference,
# word -> Author Names), then discard the 'subject' and 'word_number' columns
sentiment_column_names = {
    'title': 'Title',
    'sentiment': 'Conference',
    'word': 'Author Names'
}
melted_df2 = (
    melted_df
    .rename(columns=sentiment_column_names)
    .drop(columns=['subject', 'word_number'])
)
melted_df2.head()

Unnamed: 0,Title,Year,Conference,Author Names
0,Cambodian PM pulls back on threat to shut righ...,2017,Positive,Cambodian
1,UK court agrees to extradite suspect in Italy ...,2017,Negative,man
2,U.S. asks Venezuela for access to detained Cit...,2017,Negative,asked
3,U.S. to unveil revised self-driving car guidel...,2017,Positive,administration
4,Court allows Democratic states to defend Obama...,2017,Negative,appeals


In [22]:
# Export the sentiment-labelled word table to a tab-separated file, without the index column
melted_df2.to_csv('true_Sentiments.tsv', sep='\t', index=False)

In [None]:
###

In [None]:
###

In [None]:
###

# Melt

# Save Preprocessed Dataset

In [13]:
# Rename columns
# NOTE(review): none of the keys below ('Text', 'Platform', 'year', 'Hashtag') match the
# columns this notebook gives `df_melted` (title, Year, sentiment_score, word), and the
# 'SentimentCategory' column dropped below is not among them either, so the drop would raise
# KeyError. This cell and its output look left over from a different (social-media)
# dataset. Confirm before relying on it.
df_melted2 = df_melted.rename(columns={
    'Text': 'Title',
    'Platform': 'Conference',
    'year': 'Year',
    'Hashtag': 'Author Names'
})

# Drop the 'SentimentCategory' column
df_melted2 = df_melted2.drop(columns=['SentimentCategory'])

# Display the first few rows of the transformed DataFrame to verify the changes
df_melted2.head()

Unnamed: 0,Title,Conference,Year,Author Names
0,Enjoying a beautiful day at the park! ...,Twitter,2023,Nature
1,Traffic was terrible this morning. ...,Twitter,2023,Traffic
2,Just finished an amazing workout! 💪 ...,Instagram,2023,Fitness
3,Excited about the upcoming weekend getaway! ...,Facebook,2023,Travel
4,Trying out a new recipe for dinner tonight. ...,Instagram,2023,Cooking


In [15]:
# Export `df_melted2` to a tab-separated file, without the index column
df_melted2.to_csv('SocialMedia.tsv', sep='\t', index=False)