# Brief Introduction

This is the 2023 revision of the **Social Media Impact on Crowdfunding Success** project. This revision makes a few changes to the project; they are listed [here](https://www.evernote.com/shard/s458/sh/5feb1b0b-d2b6-d0d2-ce9e-1d65a87ebb81/).

In [440]:
import os
import nltk
import json
import spacy
import gensim
import pyLDAvis
import pyLDAvis.gensim_models 
import warnings
import torch
import joblib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import RidgeCV
warnings.filterwarnings("ignore", category=DeprecationWarning)

# Limit PyTorch to four intra-op CPU threads.
torch.set_num_threads(4)

# Folder holding the raw export and the cached artefacts read later on.
# The CROWDFUNDING_DATA_DIR environment variable overrides the original
# machine-specific default, so the notebook can run from another checkout.
# os.path.join(..., '') guarantees the trailing separator that the
# `current_dir + filename` concatenations in this notebook rely on.
current_dir = os.path.join(
    os.environ.get(
        'CROWDFUNDING_DATA_DIR',
        r'/home/theerthala/Documents/repos/Crowdfunding-Social-Media-Drivers/Data/Original_dataset/',
    ),
    '',
)

nlp = spacy.load('en_core_web_trf')
stop = stopwords.words('english')
post_data = pd.read_csv(current_dir+"2022-07-22-09-31-10-IST-feed-download.csv")


# Basic data cleaning:
#   1. keep only the last row for every distinct Message,
#   2. renumber the rows 0..n-1,
#   3. normalise column names to lower snake_case,
#   4. drop the columns listed below.
post_data = (
    post_data
    .drop_duplicates(subset=['Message'], keep='last')
    .reset_index(drop=True)
)
post_data.columns = [name.lower().replace(' ', '_') for name in post_data.columns]
post_data = post_data.drop(
    columns=[
        'user_name', 'facebook_id', 'page_category', 'page_admin_top_country',
        'overperforming_score_(weighted__—__likes_1x_shares_1x_comments_1x_love_1x_wow_1x_haha_1x_sad_1x_angry_1x_care_1x_)',
    ]
)
post_data.head()

Unnamed: 0,page_name,page_created,likes_at_posting,followers_at_posting,post_created,post_created_date,post_created_time,type,total_interactions,likes,...,url,message,link,final_link,image_text,link_text,description,sponsor_id,sponsor_name,sponsor_category
0,GoFundMe,10-02-2010 02:10,1839689,1942958.0,2022-07-22 01:22:32 IST,22-07-2022,01:22:32,Photo,21,9,...,https://www.facebook.com/299947508549/posts/10...,“I knew I had to do this with as many veterans...,https://www.facebook.com/gofundme/photos/a.101...,,HONOR,,,,,
1,Kickstarter,21-04-2009 16:42,1589842,1587991.0,2022-07-22 01:18:36 IST,22-07-2022,01:18:36,Link,3,1,...,https://www.facebook.com/73182029884/posts/101...,Artist and designer Janos Stone is creating a ...,https://www.kickstarter.com/projects/haus/toyh...,https://www.kickstarter.com/projects/haus/toyh...,,ToyHaus: a uniquely beautiful mini-playhouse f...,"ToyHaus is a washable, popup mini-playhouse wi...",,,
2,Kickstarter,21-04-2009 16:42,1589842,1587991.0,2022-07-21 18:52:39 IST,21-07-2022,18:52:39,Link,9,4,...,https://www.facebook.com/73182029884/posts/101...,The inaugural meeting of Kickstarter’s Communi...,https://www.kickstarter.com/blog/reporting-bac...,https://www.kickstarter.com/blog/reporting-bac...,,Reporting Back from the First Meeting of the K...,"Earlier this month, I was honored to host the ...",,,
3,GoFundMe,10-02-2010 02:10,1839047,1942443.0,2022-07-21 00:23:03 IST,21-07-2022,00:23:03,Native Video,163,42,...,https://www.facebook.com/299947508549/posts/10...,This 25-year-old man rushed into a burning hou...,https://www.facebook.com/peoplemag/videos/2043...,,,People,"""Nicholas Bostic's heroic actions saved lives,...",,,
4,Kickstarter,21-04-2009 16:42,1589866,1588006.0,2022-07-20 02:38:36 IST,20-07-2022,02:38:36,Link,7,5,...,https://www.facebook.com/73182029884/posts/101...,An occult-themed JRPG with a fully integrated ...,https://www.kickstarter.com/projects/cherrymoc...,https://www.kickstarter.com/projects/cherrymoc...,,EXIT VEIL: Occult & Tarot JRPG,A Dark-Psychedelic JRPG & fully integrated Tar...,,,


## Basic Preprocessing and EDA

* Entity Recognition
* Stopword Removal
* Lemmatization
* Sponsor effect investigation
* Outlier Removal

In [441]:
# Free-text columns that feed the NLP steps of this notebook.
text_columns = {'message', 'image_text', 'link_text', 'description'}

# A URL is a scheme (http:// or https://) or a leading "www." followed by
# everything up to the next whitespace. The previous pattern
# (`http?.*.com?`) was greedy and unescaped, so it erased all text between
# the first "htt" and the last "co"/"com" in a post.
URL_PATTERN = r'(?:https?://|www\.)\S+'
# Anything that is neither a word character nor whitespace, i.e. punctuation
# and symbols. Raw string: '\w' in a plain literal is an invalid escape.
NON_WORD_PATTERN = r'[^\w\s]'

for col in text_columns:
    post_data[col] = (
        post_data[col]
        .str.lower()
        .str.replace(URL_PATTERN, ' ', regex=True)
        .str.replace(NON_WORD_PATTERN, '', regex=True)
    )

# Extracting named entities (NER) from the messages and the other text columns
def recognise_entities(col, entities):
    """Run the spaCy pipeline over one column of ``post_data`` and collect entities.

    Parameters
    ----------
    col : str
        Name of the ``post_data`` column to process.
    entities : list
        Receives one ``(row, [(entity_text, entity_label), ...])`` tuple for
        every row that was processed. The list is extended in place.

    Returns
    -------
    list
        The very same ``entities`` list that was passed in.

    Notes
    -----
    A row whose processing raises ``TypeError`` or ``ValueError`` is skipped
    and tallied; the tally is printed as the number of null values
    (presumably these are rows with missing text -- confirm).
    """
    skipped_rows = 0
    row_labels = range(len(post_data))
    for row_label in tqdm(row_labels):
        try:
            parsed = nlp(post_data[col][row_label])
            found = [(span.text, span.label_) for span in parsed.ents]
            entities.append((row_label, found))
        except (TypeError, ValueError):
            skipped_rows += 1

    print(f"Number of null values in {col}: {skipped_rows}")
    return entities
    
        
def get_entities(current_dir, text_columns):
    """Return the named entities recognised for each post, keyed by row index.

    Sources are tried in this order:

    1. ``idx_to_list.json`` inside ``current_dir`` -- returned exactly as loaded.
    2. ``entities.json`` inside ``current_dir`` -- per-column results, merged
       per row here.
    3. Otherwise ``recognise_entities`` is run on every column in
       ``text_columns`` and the results are merged per row.

    Parameters
    ----------
    current_dir : str
        Directory that may contain the two JSON files above.
    text_columns : iterable of str
        Columns to run entity recognition on when neither file exists.

    Returns
    -------
    dict
        Maps the row index **as a string** to the de-duplicated list of
        ``(entity_text, entity_label)`` pairs found in any text column of
        that row, in first-seen order. String keys are what a JSON round
        trip produces, so every path now returns the same key type.
        When ``idx_to_list.json`` is used the pairs are two-item lists
        rather than tuples, as produced by ``json.load``.
    """
    merged_path = os.path.join(current_dir, 'idx_to_list.json')
    if os.path.isfile(merged_path):
        with open(merged_path, 'r') as f:
            return json.load(f)

    per_column_path = os.path.join(current_dir, 'entities.json')
    if os.path.isfile(per_column_path):
        with open(per_column_path, 'r') as f:
            entities = json.load(f)
    else:
        # NOTE(review): nothing is written back to disk here, so this
        # branch re-runs entity recognition every time no JSON file exists.
        entities = {
            text_col: recognise_entities(text_col, [])
            for text_col in text_columns
        }

    # Merge the per-column results. A dict is used as an ordered set: pairs
    # are normalised to tuples because JSON hands them back as (unhashable)
    # lists, and insertion order keeps the output deterministic.
    merged = {}
    for column_results in entities.values():
        for idx, recognised_entity_list in column_results:
            row_pairs = merged.setdefault(str(idx), {})
            for text, label in recognised_entity_list:
                row_pairs[(text, label)] = None

    return {idx: list(pairs) for idx, pairs in merged.items()}


all_entities = get_entities(current_dir, text_columns)

# Re-key by integer row index. int() accepts both the string keys produced by
# a JSON round trip and plain integer keys, so either form works here.
entities_by_row = {int(key): pairs for key, pairs in all_entities.items()}
indices = sorted(entities_by_row)

# For every row keep only the entity labels (the second item of each pair).
all_entity_types = [[label for _, label in entities_by_row[idx]] for idx in indices]

# Flat list of every label occurrence across all rows.
set_all_entities = [label for labels in all_entity_types for label in labels]

# Tally the labels in a single pass instead of calling list.count per label.
entity_label_tally = {}
for label in set_all_entities:
    entity_label_tally[label] = entity_label_tally.get(label, 0) + 1
unique_entity_types = list(entity_label_tally)
entity_type_count = sorted(entity_label_tally.items(), key=lambda pair: pair[1], reverse=True)

# Attach the labels to the posts; the Series is aligned on the row index.
post_data['entities_identified'] = pd.Series(all_entity_types, index=indices, dtype=object)
# De-duplicate the labels per row in first-seen order. Rows that have no entry
# in `all_entities` hold NaN after the alignment above and get an empty list
# instead of raising inside set()/iteration.
post_data['entities_identified'] = post_data['entities_identified'].apply(
    lambda labels: list(dict.fromkeys(labels)) if isinstance(labels, list) else []
)

# Number of posts in which each entity label appears, most frequent first.
entity_counts = post_data['entities_identified'].explode().value_counts()
sorted_entities = entity_counts.index.to_list()

# print("Entities identified: ", len(sorted_entities))

# Replace missing text with a single blank so the string operations below
# never receive NaN.
post_data.loc[:, list(text_columns)] = post_data.loc[:, list(text_columns)].fillna(' ')

# Remove English stopwords from every text column. The stopword list is
# converted to a set once, so each membership test is a hash lookup rather
# than a scan of the whole list. str.split() already discards surrounding
# whitespace, so the tokens need no further stripping.
stop_words = set(stop)
for text_col in text_columns:
    post_data[text_col] = post_data[text_col].apply(
        lambda text: ' '.join(word for word in text.split() if word not in stop_words)
    )

def lemmatize_text(col):
    """Lemmatize one text column of ``post_data``.

    Every value of the column is parsed with the notebook-level spaCy
    pipeline ``nlp`` (the same ``en_core_web_trf`` model this function used
    to reload on every call). Lemmas found in ``stop`` are discarded and the
    remaining lemmas are de-duplicated.

    Parameters
    ----------
    col : str
        Name of the ``post_data`` column to lemmatize. Its values must be
        strings.

    Returns
    -------
    pandas.Series
        Same index as ``post_data``; each value is the space-joined unique
        lemmas of the row, in order of first appearance so that repeated
        runs give identical text.
    """
    stop_words = set(stop)
    # nlp.pipe streams the texts through the pipeline in batches instead of
    # invoking it once per row.
    lemmatized = [
        ' '.join(dict.fromkeys(
            token.lemma_ for token in doc if token.lemma_ not in stop_words
        ))
        for doc in nlp.pipe(post_data[col])
    ]
    return pd.Series(lemmatized, index=post_data.index, name=col)

# Lemmatization is expensive, so its result is cached as a CSV next to the
# data. The cache is read from and written to the SAME path; previously it was
# looked up in `current_dir` but written to the working directory, so it was
# never found again unless the two happened to coincide.
lemmatized_cache_path = os.path.join(current_dir, 'temp_lemmatized_text.csv')

if os.path.isfile(lemmatized_cache_path):
    # dtype=str + keep_default_na=False read every cell back as the text that
    # was written: empty strings stay '' and tokens such as "na" or "null"
    # are not converted to NaN.
    cached_text = pd.read_csv(lemmatized_cache_path, dtype=str, keep_default_na=False)
    post_data.loc[:, list(text_columns)] = cached_text
else:
    for text_col in text_columns:
        post_data[text_col] = lemmatize_text(text_col)

    post_data.loc[:, list(text_columns)].to_csv(lemmatized_cache_path, index=False)


# Flag each post as sponsored (1) when it has a sponsor_id, otherwise 0.
post_data['post_sponsored'] = post_data['sponsor_id'].notnull().astype(int)

# Clipping the dataset: the cut-offs are the rounded 99th percentiles of
# likes, comments and shares, all computed before any row is removed.
max_likes, max_comments, max_shares = (
    round(np.quantile(post_data[metric], 0.99))
    for metric in ('likes', 'comments', 'shares')
)

# Keep only the rows that lie strictly below all three cut-offs.
below_all_cutoffs = (
    (post_data['likes'] < max_likes)
    & (post_data['comments'] < max_comments)
    & (post_data['shares'] < max_shares)
)
post_data = post_data[below_all_cutoffs].reset_index(drop=True)

# Age of the post and of the page, in whole days, measured against a fixed
# reference timestamp.
post_data['page_created'] = pd.to_datetime(post_data['page_created'], dayfirst=True)
# The 'IST' marker is removed from the text before parsing.
post_data['post_created'] = pd.to_datetime(post_data['post_created'].str.replace('IST', ''))
post_data['current_date'] = pd.to_datetime('2022-07-22 09:31:10')
for subject in ('post', 'page'):
    elapsed = post_data['current_date'] - post_data[f'{subject}_created']
    post_data[f'{subject}_age'] = elapsed.dt.days

# Collapse the individual reaction counts into two broad groups, plus an
# overall total that also includes plain likes.
post_data['positive_reactions'] = (
    post_data['love']
    .add(post_data['haha'])
    .add(post_data['wow'])
    .add(post_data['care'])
)
post_data['negative_reactions'] = post_data['sad'].add(post_data['angry'])
post_data['emoji_reactions'] = (
    post_data['positive_reactions']
    .add(post_data['negative_reactions'])
    .add(post_data['likes'])
)

# Calculating overall views: post_views is replaced by the row-wise maximum of
# the engagement columns. Rows that already report a non-zero view count also
# keep that count as a candidate for the maximum.
engagement_cols = ['emoji_reactions', 'comments', 'shares', 'total_views', 'total_views_for_all_crossposts']

has_zero_views = post_data['post_views'] == 0
zero_view_indices = post_data.index[has_zero_views].to_list()
non_zero_view_indices = post_data.index[~has_zero_views].to_list()

post_data.loc[zero_view_indices, 'post_views'] = (
    post_data.loc[zero_view_indices, engagement_cols].max(axis=1).astype(int)
)
post_data.loc[non_zero_view_indices, 'post_views'] = (
    post_data.loc[non_zero_view_indices, ['post_views'] + engagement_cols].max(axis=1).astype(int)
)

# Fill the missing followers_at_posting values with predictions from a stored
# model whose single input feature is likes_at_posting.
# .copy() makes this an independent frame, so the .loc assignment below writes
# to it unambiguously and cannot raise SettingWithCopyWarning.
likes_and_following = post_data[['likes_at_posting', 'followers_at_posting']].copy()
null_indices = likes_and_following.index[
    likes_and_following['followers_at_posting'].isnull()
].to_list()


# Loading the model.
# NOTE(review): joblib.load unpickles the file and can therefore execute
# arbitrary code -- only load a followers_model.pkl from a trusted source.
with open(current_dir+'followers_model.pkl', 'rb') as f:
    lr = joblib.load(f)

# One row per missing value, shaped (n_missing, 1) as a single-feature matrix.
needed = likes_and_following.loc[null_indices, 'likes_at_posting'].values.reshape(-1, 1)
if null_indices:
    predicted = lr.predict(needed)
    likes_and_following.loc[null_indices, 'followers_at_posting'] = predicted
else:
    # Nothing to impute (for example when this cell is run a second time).
    # Presumably `lr` is a scikit-learn regressor, whose predict() rejects an
    # empty feature matrix, so the call is skipped.
    predicted = np.empty(0)
post_data['followers_at_posting'] = likes_and_following['followers_at_posting'].astype(int)


post_data = post_data.reset_index(drop=True)
 

In [442]:
# Columns removed from the working frame in this cell.
dropping_cols = [
    'sponsor_id', 'sponsor_name', 'sponsor_category',
    'page_created', 'post_created', 'current_date',
    'total_interactions', 'post_created_date', 'post_created_time',
    'love', 'haha', 'wow', 'sad', 'angry', 'care',
    'total_views', 'total_views_for_all_crossposts',
    'url', 'link', 'final_link',
]

post_data = post_data.drop(columns=dropping_cols)

## In progress

In [443]:
# Display the column labels currently present in post_data.
post_data.columns

Index(['page_name', 'likes_at_posting', 'followers_at_posting', 'type',
       'likes', 'comments', 'shares', 'video_share_status', 'is_video_owner?',
       'post_views', 'video_length', 'message', 'image_text', 'link_text',
       'description', 'entities_identified', 'post_sponsored', 'post_age',
       'page_age', 'positive_reactions', 'negative_reactions',
       'emoji_reactions'],
      dtype='object')

In [452]:
# Preview the post-type and video-related columns.
cols_left = ['type', 'video_share_status', 'is_video_owner?', 'video_length']
post_data.loc[:, cols_left]

Unnamed: 0,type,video_share_status,is_video_owner?,video_length
0,Photo,,-,
1,Link,,-,
2,Link,,-,
3,Native Video,share,No,00:01:04
4,Link,,-,
...,...,...,...,...
8781,Link,,-,
8782,Video,,-,
8783,Native Video,owned,Yes,00:00:55
8784,Native Video,owned,Yes,00:00:15


In [454]:
# Count how many posts have a video_length value (True) and how many do not (False).
post_data['video_length'].notna().value_counts()

video_length
False    5974
True     2812
Name: count, dtype: int64

In [None]:
# NOTE(review): this cell has no execution count and holds only bare numeric
# literals. What they measure is not stated anywhere in view -- TODO label them
# in a markdown cell or delete the cell.
14992516676.52882
523428601352.95355
14992516676.52882

## Preprocessing