In [2]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import numpy as np
import pandas as pd
from collections import Counter
import pandas as pd
import urlextract
from spellchecker import SpellChecker
import re
import emoji
import unidecode
from bs4 import BeautifulSoup
import urlextract

In [34]:
### CLEANING
# Load the raw annotation table and drop the columns that are not used downstream.
df = pd.read_csv('C:/Users/david/OneDrive/Skrivebord/Reddit Analysis/MFRC.csv')
df = df.drop(['confidence', 'annotator'], axis=1)

# A single annotation cell may list several comma-separated Moral Foundations.
# Split them so every row carries exactly one foundation, and save that long-format table.
df = df.assign(annotation=df['annotation'].str.split(','))
df = df.explode('annotation')
df.to_csv('C:/Users/david/OneDrive/Skrivebord/Reddit Analysis/MFRC_ind_anno.csv', index=False)

# Collect the winning row(s) for every post in a plain list and build the DataFrame once
# at the end. Growing a DataFrame with pd.concat inside the loop copies all previous rows
# on every iteration and concatenating onto an empty frame is deprecated in recent pandas.
majority_rows = []

# groupby(sort=False) yields the posts in order of first appearance (same order as
# df['text'].unique()) and hands over all rows of one post without re-scanning the frame.
# Rows whose text is NaN are skipped by groupby; the previous equality filter never
# matched them either, so the output is unchanged.
for unique_post, post_rows in df.groupby('text', sort=False):
    # Votes per annotation for the current post, most frequent first.
    annotation_counts = post_rows['annotation'].value_counts()

    # Every annotation that reaches the highest vote count (more than one means a tie).
    max_annotation_count = annotation_counts.max()
    top_annotations = annotation_counts[annotation_counts == max_annotation_count].index

    if len(top_annotations) == 1:
        # Clear majority: keep the post's first row, relabelled with the winning annotation.
        new_row = post_rows.iloc[0].copy()
        new_row['annotation'] = top_annotations[0]
        majority_rows.append(new_row)
    else:
        # Tie: keep one row per tied annotation (the first row carrying that annotation).
        # If a post has no annotation at all, top_annotations is empty and nothing is added.
        for annotation in top_annotations:
            majority_rows.append(post_rows[post_rows['annotation'] == annotation].iloc[0])

# Build the result in one step; passing the columns keeps the header even when no row qualified.
majority_annotated_posts = pd.DataFrame(majority_rows, columns=df.columns).reset_index(drop=True)

# Save in new file
majority_annotated_posts.to_csv('C:/Users/david/OneDrive/Skrivebord/Reddit Analysis/majority_annotated_posts.csv', index=False)

In [2]:
# Reload the majority-vote table produced by the cleaning step.
df = pd.read_csv('C:/Users/david/OneDrive/Skrivebord/Reddit Analysis/majority_annotated_posts.csv')

# Keep only posts labelled with an actual foundation, i.e. discard the
# 'Non-Moral' and 'Thin Morality' annotations.
excluded_annotations = ['Non-Moral', 'Thin Morality']
is_excluded = df['annotation'].isin(excluded_annotations)
df = df.loc[~is_excluded]

# The ID columns were only needed during cleaning; drop them and renumber the rows from 0.
only_foundations_df = df.drop(columns=['post_ID', 'row_ID']).reset_index(drop=True)

In [10]:
### Function for cleaning the posts
# The URL extractor and the spell checker are shared across calls and created lazily on
# first use: constructing them once per post is needlessly expensive when clean_text is
# applied to a whole column.
_CLEAN_TEXT_TOOLS = {}


def _get_clean_text_tools():
    """Return the shared (URLExtract, SpellChecker) pair, creating it on first use."""
    if not _CLEAN_TEXT_TOOLS:
        _CLEAN_TEXT_TOOLS['extractor'] = urlextract.URLExtract()
        _CLEAN_TEXT_TOOLS['spell'] = SpellChecker()
    return _CLEAN_TEXT_TOOLS['extractor'], _CLEAN_TEXT_TOOLS['spell']


def clean_text(text):
    """Normalise one post for sentiment analysis.

    Steps, in order:
      1. remove every URL found by urlextract,
      2. convert non-ASCII characters to ASCII with unidecode,
      3. collapse runs of whitespace into single spaces and trim both ends,
      4. spell-correct each whitespace-separated token.

    Parameters
    ----------
    text : str
        Raw post text.

    Returns
    -------
    str
        The cleaned text, tokens joined by single spaces.
    """
    extractor, spell = _get_clean_text_tools()

    # Remove URLs using urlextract
    for url in extractor.find_urls(text):
        text = text.replace(url, '')

    # Remove non-ASCII characters using unidecode
    text = unidecode.unidecode(text)

    # Remove extra white spaces
    text = re.sub(r'\s+', ' ', text).strip()

    # Spell check: look each token up once and keep the original token whenever the
    # checker has no suggestion (correction() returns None in that case).
    corrected_words = []
    for word in text.split():
        suggestion = spell.correction(word)
        corrected_words.append(suggestion if suggestion is not None else word)

    return ' '.join(corrected_words)

# Run the cleaning function over every post and overwrite the 'text' column with the result.
cleaned_posts = only_foundations_df['text'].apply(clean_text)
only_foundations_df['text'] = cleaned_posts

# Persist the cleaned table so later steps can start from this file.
only_foundations_df.to_csv(
    'C:/Users/david/OneDrive/Skrivebord/Reddit Analysis/MFRC_clean_for_analysis.csv',
    index=False,
)

In [16]:
### Run sentiment analysis

# Open up file
df = pd.read_csv('C:/Users/david/OneDrive/Skrivebord/Reddit Analysis/MFRC_clean_for_analysis.csv')

# Initialize VADER sentiment analyzer. On a fresh environment the lexicon may not be
# installed yet, in which case NLTK raises LookupError; download it once and retry.
try:
    sid = SentimentIntensityAnalyzer()
except LookupError:
    nltk.download('vader_lexicon')
    sid = SentimentIntensityAnalyzer()

# A post that cleaning reduced to an empty string is read back from the CSV as NaN, and
# VADER cannot score a non-string value. Score such posts as empty text instead; the
# 'text' column of the DataFrame itself is left untouched.
texts_for_scoring = df['text'].fillna('').astype(str)

# Run sentiment analysis on each post; every entry is the dict returned by VADER.
score_dicts = [sid.polarity_scores(post_text) for post_text in texts_for_scoring]

# Store the raw score dictionaries in a new column called 'sentiment_scores'
df_with_sentiment = df.assign(sentiment_scores=score_dicts)

### Now expand the dictionaries in sentiment_scores into one column per score for individual analysis
# Reusing df's index keeps the expanded scores aligned row-for-row with the posts.
sentiment_normalize = pd.DataFrame(score_dicts, index=df.index)

# merge the expanded columns with the original dataframe
final_df = pd.concat([df_with_sentiment, sentiment_normalize], axis=1)

# Save in new file. This file has duplicates of posts with different foundations ascribed to it in case of vote ties
final_df.to_csv('C:/Users/david/OneDrive/Skrivebord/Reddit Analysis/DF_with_foundation_duplicates.csv', index=False)

In [38]:
### Lose all rows which have 'split-ties' over relevant Moral Foundation
df = pd.read_csv('C:/Users/david/OneDrive/Skrivebord/Reddit Analysis/DF_with_foundation_duplicates.csv')

# A post whose vote was tied appears once per tied foundation, i.e. its text is duplicated.
# keep=False marks every copy of a duplicated text, so all rows of a tied post are removed.
df = df[~df['text'].duplicated(keep=False)]

# Reset the index. drop=True discards the old row labels; without it they would be turned
# into an extra 'index' column and written to the final CSV.
df = df.reset_index(drop=True)

# Save in new file
df.to_csv('C:/Users/david/OneDrive/Skrivebord/Reddit Analysis/DF_Final.csv', index=False)