# Social Media Analytics
## Project 3
## Graph Analysis - Creating Graphs to Analyze in Gephi (Best Buy Reviews)
Felix Funes 20220306 | Paula Catalan 20221048 | Efstathia Styliagkatzi 20220078 | Alisson Tapia 20221156 | S M Abrar Hossain Asif 20220223

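NOTE: This code requires up-to-date versions of its packages. If it raises an error, run "pip install --upgrade pip" and then "pip install --upgrade networkx scipy pandas-profiling numpy" in your terminal. The notebook also imports nltk, rake_nltk and bs4 (BeautifulSoup), and it uses the NLTK English stop-word list, which may need to be downloaded once with nltk.download('stopwords').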

### Initial Setup

In [1]:
import pandas as pd
from collections import Counter
import networkx as nx
import matplotlib.pyplot as plt
import scipy.sparse as sp
from scipy.sparse import coo_matrix
import numpy as np
# Import packages
import csv
import pandas as pd
import numpy as np
import nltk
import re
from nltk.tokenize import word_tokenize
import re
from bs4 import BeautifulSoup
from rake_nltk import Rake
from nltk.corpus import stopwords

In [2]:
# Load the extracted Best Buy reviews workbook from the project repository.
# read_excel already returns a DataFrame, so no extra wrapping is needed.
reviews_url = "https://github.com/Alito06/FinalProjectSMA/raw/main/ExtractedReviewsDataCollection_bestbuy.xlsx"
df = pd.read_excel(reviews_url)

In [3]:
# Display the loaded reviews table (the notebook renders it as the cell output below)
df

Unnamed: 0.1,Unnamed: 0,device,user,rating,text,date,ownership_length
0,0,Apple - iPhone 14 128GB - Midnight (Verizon),BigG,5,Apple makes the best cellphone on the market h...,2023-02-03,less than 1 week
1,1,Apple - iPhone 14 128GB - Midnight (Verizon),Jp44087,5,"Ease of use, good battery life, 128gb fits me ...",2023-02-03,3 weeks
2,2,Apple - iPhone 14 128GB - Midnight (Verizon),GamerDadLife,5,Love it works great and the red color is the m...,2022-12-24,2 weeks
3,3,Apple - iPhone 14 128GB - Midnight (Verizon),LevanaP,5,Been a long time iPhone user. This is a awesom...,2023-04-14,1 week
4,4,Apple - iPhone 14 128GB - Midnight (Verizon),Anonymous,5,My wife dropped her phone right AFTER the Appl...,2023-04-15,3 weeks
...,...,...,...,...,...,...,...
369,369,Apple - iPhone 14 128GB - Purple (T-Mobile),Heart,3,Value for the $$$. Security a headache. It is ...,2023-02-24,1 week
370,370,Apple - iPhone 14 128GB - Purple (T-Mobile),CharlesK,5,My mom got this and she loves this phone the n...,2023-01-08,Unknown
371,371,Apple - iPhone 14 128GB - Purple (T-Mobile),Darklight,5,I loved it because the camra looks great abd d...,2022-09-19,Unknown
372,372,Apple - iPhone 14 128GB - Purple (T-Mobile),user482290,1,I went into the store with my wife and child t...,2023-02-05,Unknown


### Data Preprocessing

In [4]:
# Text clean-up helper applied to each review before building the graph
def textPreProcess(rawText, removeHTML=True, charsToRemove = r'\?|\.|\!|\;|\.|\"|\,|\(|\)|\&|\:|\-|\$', removeNumbers=True, removeLineBreaks=False, specialCharsToRemove = r'[^\x00-\xfd]', convertToLower=True, removeConsecutiveSpaces=True):
    """Clean a piece of text through a fixed sequence of optional steps.

    The steps run in this order: strip HTML, blank out punctuation, blank out
    digit runs, replace line breaks, blank out special characters, lower-case,
    collapse repeated spaces. Removed characters are replaced by a single
    space (except '\\r', which is deleted), so words never get glued together.

    Parameters
    ----------
    rawText : the value to clean; anything whose type is not exactly ``str``
        is returned unchanged.
    removeHTML : parse the text with BeautifulSoup and keep only its text.
    charsToRemove : regex of punctuation to replace by a space; an empty
        string skips the step.
    removeNumbers : replace every run of digits by a space.
    removeLineBreaks : replace '\\n' by a space and delete '\\r'.
    specialCharsToRemove : regex of special characters to replace by a space;
        an empty string skips the step.
    convertToLower : lower-case the text.
    removeConsecutiveSpaces : collapse runs of spaces into a single space.

    Returns
    -------
    The cleaned string, or ``rawText`` itself when it is not a ``str``.
    """
    # Non-string cells (e.g. missing values) are passed through untouched
    if type(rawText) is not str:
        return rawText

    text = rawText

    # Step 1: drop HTML markup, keeping only the visible text
    if removeHTML:
        soup = BeautifulSoup(text, 'html.parser')
        text = soup.get_text()

    # Step 2: punctuation -> space
    if len(charsToRemove) != 0:
        text = re.compile(charsToRemove).sub(' ', text)

    # Step 3: digit runs -> space
    if removeNumbers:
        text = re.compile(r'\d+').sub(' ', text)

    # Step 4: line feeds become spaces, carriage returns disappear
    if removeLineBreaks:
        text = text.replace('\n', ' ')
        text = text.replace('\r', '')

    # Step 5: characters matched by the special-character pattern -> space
    if len(specialCharsToRemove) != 0:
        text = re.compile(specialCharsToRemove).sub(' ', text)

    # Step 6: case normalisation
    if convertToLower:
        text = text.lower()

    # Step 7: squeeze the runs of spaces produced by the steps above
    if removeConsecutiveSpaces:
        text = re.compile(' +').sub(' ', text)

    return text

In [5]:
# Tokenize words
def tokenize_words(words):
    """Split a text into NLTK word tokens.

    Parameters
    ----------
    words : the text to tokenize.

    Returns
    -------
    list of str
        The tokens produced by ``word_tokenize``.
    float
        ``np.nan`` when ``words`` is not a string or when tokenization yields
        no tokens (empty or whitespace-only text).
    """
    if not isinstance(words, str):
        return np.nan

    # Tokenize once. The previous check compared the token list with '' (always
    # False), so empty results were never mapped to NaN and the text was
    # tokenized twice.
    tokens = word_tokenize(words)
    return tokens if tokens else np.nan

In [6]:
# Clean the review text column (digits are kept: removeNumbers=False) and store the
# result in a one-column dataframe that shares the index of the original data
cleaned_text = df['text'].apply(textPreProcess, removeNumbers=False)
processedReviews = pd.DataFrame({'PreProcessedText': cleaned_text.values}, index=df.index)



### Graph Creation

In [7]:
# Extracting the 'PreProcessedText' column from the 'processedReviews' DataFrame as a list.
# textPreProcess returns non-string values (e.g. NaN for empty cells) unchanged, and the
# following cells call .lower() on every review, so only actual strings are kept here.
reviews = [
    text
    for text in processedReviews['PreProcessedText'].tolist()
    if isinstance(text, str)
]

In [8]:
# Flatten all reviews into one list of lower-cased, whitespace-separated words
words = [token for review in reviews for token in review.lower().split()]
# Tally how many times each word occurs across all reviews
word_freq = Counter(words)

In [9]:
# Start from an empty undirected graph
G = nx.Graph()
# English stop words from NLTK, held in a set for fast membership tests
stop_words = set(stopwords.words('english'))
# One node per non-stop word; its overall frequency is stored in the 'size' attribute
for token, count in word_freq.items():
    if token in stop_words:
        continue
    G.add_node(token, size=count)

In [10]:
# Build the weighted co-occurrence edges: two words are linked when they appear in
# the same review, and the edge weight counts how many word-position pairs link them.
from itertools import combinations

for review in reviews:
    # Lower-case and split the review, keeping only non-stop words. Without this
    # filter add_edge would silently re-create the stop-word nodes that were left
    # out when the nodes were added (and they would lack the 'size' attribute).
    review_words = [word for word in review.lower().split() if word not in stop_words]

    # Every unordered pair of word positions in the review (first before second)
    for first, second in combinations(review_words, 2):
        # A word repeated inside one review must not be linked to itself
        if first == second:
            continue
        if G.has_edge(first, second):
            # Known pair: one more co-occurrence
            G[first][second]['weight'] += 1
        else:
            # New pair: create the edge with an initial weight of 1
            G.add_edge(first, second, weight=1)

In [11]:
# Number the nodes 0..n-1 in the graph's iteration order: node name -> integer index
node_index = dict(zip(G.nodes(), range(len(G.nodes()))))

In [12]:
# Convert the graph to a sparse (CSR) weighted adjacency matrix.
# G is undirected but G.edges() yields each edge only once, so every edge is
# written in both orientations to keep the matrix symmetric.
n = len(G.nodes())
row, col, data = [], [], []
for u, v, d in G.edges(data=True):
    i, j = node_index[u], node_index[v]
    row.append(i)
    col.append(j)
    data.append(d['weight'])
    # Mirror entry; skipped on the diagonal because COO -> CSR conversion sums
    # duplicate coordinates and would double the weight of a self-loop.
    if i != j:
        row.append(j)
        col.append(i)
        data.append(d['weight'])
mat = sp.coo_matrix((data, (row, col)), shape=(n, n)).tocsr()

In [13]:
# Calculate the weighted pagerank; the result is a dict keyed by node name
pr = nx.pagerank(G, weight='weight')

# Add the pagerank to the node attributes.
# The lookup must use the node itself: looking it up by integer position (as before)
# never matched a key, so every node was stored with pagerank 0.
for node_id in G.nodes():
    G.nodes[node_id]['pagerank'] = pr.get(node_id, 0)

In [14]:
# Export the graph (nodes, edges and their attributes) as a GEXF file that Gephi can open
gexf_path = "best buy graph.gexf"
nx.write_gexf(G, gexf_path)