# Social Media Analytics
## Project 3
## Graph Analysis - Creating Graphs to Analyze in Gephi (Reddit Posts)
Felix Funes 20220306 | Paula Catalan 20221048 | Efstathia Styliagkatzi 20220078 | Alisson Tapia 20221156 | S M Abrar Hossain Asif 20220223

NOTE: This notebook requires up-to-date versions of its packages. If you get an error, run `pip install --upgrade pip` and then `pip install --upgrade networkx scipy pandas-profiling numpy` in your terminal.

### Initial Setup

In [1]:
import re
from collections import Counter
from itertools import combinations

import networkx as nx
import nltk
import numpy as np
import pandas as pd
import scipy.sparse as sp
from bs4 import BeautifulSoup
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from rake_nltk import Rake
from scipy.sparse import coo_matrix

In [2]:
# Import data: load the iPhone subreddit comments workbook from the project's GitHub repository.
# read_excel already returns a DataFrame, so no extra pd.DataFrame(...) wrapper is needed.
df = pd.read_excel("https://github.com/felix-funes/social-media-analytics/blob/main/iphone-subreddit-comments.xlsx?raw=true")
# Replace missing bodies with an empty string before casting to str; otherwise
# empty cells would become the literal word "nan" and end up as a token in the word graph.
df["body"] = df["body"].fillna("").astype(str)

### Data Preprocessing

In [3]:
# Defining the text preprocessing function
def textPreProcess(rawText, removeHTML=True, charsToRemove = r'\?|\.|\!|\;|\.|\"|\,|\(|\)|\&|\:|\-|\$', removeNumbers=True, removeLineBreaks=False, specialCharsToRemove = r'[^\x00-\xfd]', convertToLower=True, removeConsecutiveSpaces=True):
    """Clean a piece of raw text through a configurable sequence of steps.

    The steps run in this fixed order: HTML stripping, punctuation removal,
    digit removal, line-break removal, special-character removal,
    lower-casing and collapsing of repeated spaces. Every removed span is
    replaced by a single space (carriage returns are simply dropped).

    Args:
        rawText: Text to clean. Non-string values are returned unchanged.
        removeHTML: Parse the text with BeautifulSoup and keep only its text.
        charsToRemove: Regex of punctuation to blank out; None or '' skips
            the step.
        removeNumbers: Blank out every run of digits.
        removeLineBreaks: Turn '\\n' into a space and drop '\\r'.
        specialCharsToRemove: Regex of further characters to blank out (the
            default matches any character above U+00FD); None or '' skips
            the step.
        convertToLower: Lower-case the text.
        removeConsecutiveSpaces: Collapse runs of spaces into one space.

    Returns:
        The cleaned string, or rawText itself when it is not a string.
        Leading/trailing spaces are not stripped.
    """
    # isinstance (rather than an exact type comparison) also accepts str subclasses
    if not isinstance(rawText, str):
        return rawText
    procText = rawText

    # Remove HTML
    if removeHTML:
        procText = BeautifulSoup(procText,'html.parser').get_text()

    # Remove punctuation and other special characters
    # (truthiness check so that None, like '', disables the step instead of raising)
    if charsToRemove:
        procText = re.sub(charsToRemove,' ',procText)

    # Remove numbers
    if removeNumbers:
        procText = re.sub(r'\d+',' ',procText)

    # Remove line breaks
    if removeLineBreaks:
        procText = procText.replace('\n',' ').replace('\r', '')

    # Remove special characters
    if specialCharsToRemove:
        procText = re.sub(specialCharsToRemove,' ',procText)

    # Normalize to lower case
    if convertToLower:
        procText = procText.lower()

    # Replace multiple consecutive spaces with just one space
    if removeConsecutiveSpaces:
        procText = re.sub(' +', ' ', procText)

    return procText

In [4]:
# Tokenize words
def tokenize_words(words):
    """Split a string into word tokens with NLTK's word_tokenize.

    Args:
        words: The text to tokenize.

    Returns:
        The list of tokens, or np.nan when the input is not a string or
        produces no tokens.
    """
    if not isinstance(words, str):
        return np.nan
    # Tokenize once and reuse the result
    tokens = word_tokenize(words)
    # word_tokenize returns a list, so "no tokens" is detected by truthiness;
    # comparing the list against '' would never be true
    return tokens if tokens else np.nan

In [5]:
# Clean every comment body with textPreProcess (digits are kept) and store the
# result in a one-column DataFrame that keeps df's index
processedReviews = (
    df["body"]
    .apply(textPreProcess, removeNumbers=False)
    .to_frame(name="PreProcessedText")
)



### Graph Creation

In [6]:
# Pull the cleaned comment texts out of 'processedReviews' as a plain Python list
reviews = processedReviews.PreProcessedText.to_list()

In [7]:
# Flatten all reviews into one list of lower-cased, whitespace-separated words
words = [token for text in reviews for token in text.lower().split()]
# Tally how many times each word occurs across all reviews
word_freq = Counter(words)

In [8]:
# Creating an empty undirected graph; words become nodes and co-occurrences become edges in the cells below
G = nx.Graph()

In [9]:
# English stop words from NLTK; these never become nodes
stop_words = set(stopwords.words('english'))
# Add one node per remaining word, storing its corpus frequency as the 'size' attribute
G.add_nodes_from(
    (term, {'size': count})
    for term, count in word_freq.items()
    if term not in stop_words
)

In [10]:
# Build weighted co-occurrence edges: two words are linked when they appear in the same review
for review in reviews:
    # Keep only the words that were added as nodes. Without this filter, add_edge
    # would silently re-create the stop-word nodes skipped during node creation
    # (without a 'size' attribute) and spend quadratic work on pairs involving them.
    review_words = [word for word in review.lower().split() if word not in stop_words]

    # combinations yields every pair of positions (i < j) exactly once, in order.
    # A word occurring k times in a review therefore contributes k pairings with
    # each other word, and pairs with itself as a self-loop.
    # NOTE(review): self-loops are kept to preserve the existing results - confirm they are wanted in Gephi.
    for first_word, second_word in combinations(review_words, 2):
        if G.has_edge(first_word, second_word):
            # Edge already present: count one more co-occurrence
            G[first_word][second_word]['weight'] += 1
        else:
            # First co-occurrence of this pair
            G.add_edge(first_word, second_word, weight=1)

In [11]:
# Give every node an integer position (0, 1, 2, ...) following the graph's node iteration order
node_index = {name: position for position, name in enumerate(G)}

In [12]:
# Store the edge weights in an n x n CSR sparse matrix.
# Each edge is written once, at (index of u, index of v), as yielded by G.edges.
n = G.number_of_nodes()
weighted_edges = [
    (node_index[source], node_index[target], attrs['weight'])
    for source, target, attrs in G.edges(data=True)
]
row = [r for r, _, _ in weighted_edges]
col = [c for _, c, _ in weighted_edges]
data = [w for _, _, w in weighted_edges]
mat = sp.coo_matrix((data, (row, col)), shape=(n, n)).tocsr()

In [13]:
# Keep only nodes that are not common words and whose first WordNet synset is a
# noun ('n'), verb ('v') or adjective ('a'); words unknown to WordNet are dropped
common_words = set(stopwords.words('english')) | {'a', 'an', 'the', 'as', 're', 'm', 'ur', 'v', 'iv', 'o', 'y', 'he', 'were', 'isn', 'd', 'its', 'at', 'or', 'then', 's', 'don', 't', 'on', 'in', 'it', 'i', 'x', 'r', 'ee', 'mo', 'tl', 'l', 'f', 'e', 'n', 'c', 'u', 'w', 'se'}
allowed_pos = {'n', 'v', 'a'}
filtered_nodes = []
for node in G.nodes():
    # Cheap set lookup first, so common words never trigger a WordNet query
    if node in common_words:
        continue
    # Query WordNet once per node and reuse the result
    synsets = wordnet.synsets(node)
    if synsets and synsets[0].pos() in allowed_pos:
        filtered_nodes.append(node)
# subgraph() only returns a view onto the original graph; copy() materialises an
# independent graph holding just the kept nodes and the edges between them
G = G.subgraph(filtered_nodes).copy()

# Create a dictionary that maps each node name to a unique integer index
node_index = {node_name: i for i, node_name in enumerate(G.nodes())}

# Convert the graph to a sparse matrix again after filtering
n = len(G.nodes())
row, col, data = [], [], []
for u, v, d in G.edges(data=True):
    row.append(node_index[u])
    col.append(node_index[v])
    data.append(d['weight'])
mat = sp.coo_matrix((data, (row, col)), shape=(n, n)).tocsr()

In [14]:
# Compute PageRank scores, using the co-occurrence counts as edge weights
pr = nx.pagerank(G, weight='weight')

# Attach each score to its node under the 'pagerank' attribute
nx.set_node_attributes(G, pr, 'pagerank')

In [15]:
# Display every node together with its attribute dict ('size' and 'pagerank') to check the result
G.nodes(data=True)

NodeDataView({'wait': {'size': 14, 'pagerank': 0.0018278844567924113}, 'month': {'size': 6, 'pagerank': 0.0010433983806209714}, 'receive': {'size': 3, 'pagerank': 0.0002537924966929369}, 'live': {'size': 9, 'pagerank': 0.0007614127204804455}, 'italy': {'size': 2, 'pagerank': 0.00012779237871135072}, 'matters': {'size': 1, 'pagerank': 8.557801165853086e-05}, 'solo': {'size': 1, 'pagerank': 9.825193581709036e-05}, 'make': {'size': 30, 'pagerank': 0.002005865347207334}, 'great': {'size': 11, 'pagerank': 0.0006308935275452112}, 'leather': {'size': 7, 'pagerank': 0.0004088696703561819}, 'cases': {'size': 16, 'pagerank': 0.001042058007446787}, 'available': {'size': 10, 'pagerank': 0.0010140406790382372}, 'europe': {'size': 2, 'pagerank': 0.00027190917096651956}, 'example': {'size': 5, 'pagerank': 0.00032518156184936046}, 'amazon': {'size': 10, 'pagerank': 0.0006344038148739144}, 'like': {'size': 75, 'pagerank': 0.006114219870442712}, 'lot': {'size': 16, 'pagerank': 0.0010918487539359092}, 'u

In [16]:
# Save the graph as a GEXF file (written relative to the current working directory) so it can be loaded into Gephi
nx.write_gexf(G, "reddit graph final.gexf")