# Step 1: Merge the data in a single file

**Required Libraries:**
  
  !pip install textblob

  !pip install langdetect

  !pip install torch transformers datasets

In [None]:
!pip install tqdm

In [None]:
!pip install textblob

In [None]:
!pip install langdetect

In [52]:
!pip install texttable

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting texttable
  Downloading texttable-1.6.7-py2.py3-none-any.whl (10 kB)
Installing collected packages: texttable
Successfully installed texttable-1.6.7


In [1]:
# To mount google drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [8]:
# Import Packages
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import re
from tqdm import tqdm
tqdm.pandas()
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import spacy
nltk.download('stopwords')
nltk.download('wordnet')
from textblob import TextBlob
from langdetect import detect
from langdetect import LangDetectException

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


## English Review detection steps:

In [None]:
# Load Course Reviews Data
Course_reviews = pd.read_csv('/content/drive/MyDrive/CMU SUBJECTS/DATA/Course_reviews.csv')

In [None]:
def detect_language(text):
    try:
        return detect(text)
    except LangDetectException:
        return None

In [None]:
Course_reviews['lang_detect'] = Course_reviews['reviews'].progress_apply(lambda x: detect_language(str(x)))

In [None]:
# Only select english reviews and load into csv for further use.
Course_reviews_Eng = Course_reviews[Course_reviews['lang_detect'] == 'en']
Course_reviews_Eng.to_csv('/content/drive/MyDrive/CMU SUBJECTS/DATA/Course_reviews_Eng.csv', index=False)

## Pre Processing the course review data


1.   Remove duplicate rows
2.  Convert word with n't to not
3.  Remove any special character
4. Adding sentiment value and sentiment score for the review 



In [22]:
# Load Course Reviews Data
Course_reviews = pd.read_csv('/content/drive/MyDrive/CMU SUBJECTS/DATA/Clean Data/Course_reviews_Eng.csv')

In [23]:
# Drop duplicates records
print("Before dropping duplicate rows:",len(Course_reviews))
Course_reviews = Course_reviews.drop_duplicates()
print("After dropping duplicate rows:",len(Course_reviews))

Before dropping duplicate rows: 1190399
After dropping duplicate rows: 424417


In [24]:
# Removing Non Alphabets characters from review
Course_reviews['reviews'] = Course_reviews['reviews'].progress_apply(lambda x: re.sub(r"n't", " not", x))
Course_reviews['reviews'] = Course_reviews['reviews'].progress_apply(lambda x: re.sub(r'[^\w\s]', ' ', x))


# Lowercasing 
Course_reviews['reviews']=Course_reviews['reviews'].str.lower()

# Remove Whitespace:
def remove_whitespace(text):
    return  " ".join(text.split())

Course_reviews["reviews"] = Course_reviews['reviews'].progress_apply(remove_whitespace)

100%|██████████| 424417/424417 [00:01<00:00, 355818.42it/s]
100%|██████████| 424417/424417 [00:03<00:00, 138158.40it/s]
100%|██████████| 424417/424417 [00:02<00:00, 168574.09it/s]


In [25]:
# Sentiment Polarity score
def text_polarity_score(text):
    temp_dict = {}
    p = TextBlob(text).sentiment.polarity
    if p >= 0.5:
        label = "Positive"
    elif (p >= 0.0):
        label ="Neutral"
    else:
        label ="Negative"
    temp_dict['label'] = label
    temp_dict['score'] = p
    
    return temp_dict

In [26]:
sentiment_dict = Course_reviews['reviews'].progress_apply(text_polarity_score)

100%|██████████| 424417/424417 [02:23<00:00, 2965.52it/s]


In [27]:
Course_reviews['sentiments'] = [d['label'] for d in sentiment_dict]
Course_reviews['sentiments_score'] = [d['score'] for d in sentiment_dict]

In [28]:
Course_reviews.to_csv('/content/drive/MyDrive/CMU SUBJECTS/DATA/coursera_reviews_clean_data.csv', index=False)

## **Merge all the data:**

*   coursera_courses
*   coursera_review_clean_data
*   University_Global_Rankings
*   Coursera_courses_category





In [29]:
# Load Courses details Data
Coursera_courses = pd.read_csv('/content/drive/MyDrive/CMU SUBJECTS/DATA/Clean Data/Coursera_courses.csv')

In [30]:
# Load Courses details Data
Coursera_reviews = pd.read_csv('/content/drive/MyDrive/CMU SUBJECTS/DATA/Clean Data/coursera_reviews_clean_data.csv')

In [31]:
# Load University_Global_Ranking
University_Global_Ranking = pd.read_csv('/content/drive/MyDrive/CMU SUBJECTS/DATA/Clean Data/University_Global_Ranking.csv')

In [32]:
# Load Coursera_courses__category
Coursera_courses_category = pd.read_csv('/content/drive/MyDrive/CMU SUBJECTS/DATA/Clean Data/Coursera_courses_category.csv',encoding='ISO-8859-1')

In [33]:
# Load the data from csv to temp dataframes
temp_merge_df = pd.merge(Coursera_courses,University_Global_Ranking[["institution","World_Rank","I_Category"]],how="inner",on='institution')
temp_merge_df = pd.merge(temp_merge_df,Coursera_courses_category[["course_id","subject"]],how="inner",on='course_id')
temp_merge_df = pd.merge(temp_merge_df,Coursera_reviews[["course_id","reviews","reviewers","date_reviews","rating","sentiments","sentiments_score","lang_detect"]],how="inner",on='course_id')

In [None]:
temp_merge_df.head()

In [35]:
temp_merge_df.to_csv('/content/drive/MyDrive/CMU SUBJECTS/DATA/Clean Data/Coursera_Final_Merge_clean_Data.csv', index=False)

## **Hypothesis Testing**

In [36]:
Final_data = pd.read_csv('/content/drive/MyDrive/CMU SUBJECTS/DATA/Clean Data/Coursera_Final_Merge_clean_Data.csv')

In [57]:
Final_data.head()

Unnamed: 0,name,institution,course_url,course_id,World_Rank,I_Category,subject,reviews,reviewers,date_reviews,rating,sentiments,sentiments_score,lang_detect
0,Machine Learning,Stanford University,https://www.coursera.org/learn/machine-learning,machine-learning,3.0,University,Data Science,this is an extremely basic course machine lear...,By Deleted A,"Mar 18, 2017",1,Neutral,0.04572,en
1,Machine Learning,Stanford University,https://www.coursera.org/learn/machine-learning,machine-learning,3.0,University,Data Science,the course is ok but the certification procedu...,By Bruno C,"Nov 09, 2015",1,Neutral,0.1625,en
2,Machine Learning,Stanford University,https://www.coursera.org/learn/machine-learning,machine-learning,3.0,University,Data Science,i just started week 3 i have to admit that it ...,By Fadi,"Apr 15, 2019",1,Neutral,0.232,en
3,Machine Learning,Stanford University,https://www.coursera.org/learn/machine-learning,machine-learning,3.0,University,Data Science,this course is absolute garbage you get no fee...,By Mathew L,"Sep 25, 2015",1,Negative,-0.154762,en
4,Machine Learning,Stanford University,https://www.coursera.org/learn/machine-learning,machine-learning,3.0,University,Data Science,however good the material and lectures may be ...,By Rui C,"Dec 12, 2015",1,Neutral,0.010833,en


In [53]:
# Import Statstical package
import statsmodels.api as sm
from pprint import pprint
from tabulate import tabulate
from texttable import Texttable

### **Hypothesis testing between sentiment score and rating**

In [38]:
pos_neg_reviews = Final_data[Final_data["sentiments"] !="Neutral"]

In [49]:
X = pos_neg_reviews["sentiments_score"]
Y = pos_neg_reviews['rating']

X = sm.add_constant(X)
model = sm.OLS(Y, X).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                 rating   R-squared:                       0.282
Model:                            OLS   Adj. R-squared:                  0.282
Method:                 Least Squares   F-statistic:                 6.407e+04
Date:                Sat, 15 Apr 2023   Prob (F-statistic):               0.00
Time:                        14:45:37   Log-Likelihood:            -1.5536e+05
No. Observations:              162990   AIC:                         3.107e+05
Df Residuals:                  162988   BIC:                         3.107e+05
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
const                3.8928      0.004  

In [56]:
dummy_vars_i = pd.get_dummies(pos_neg_reviews['I_Category'], prefix='Inst')
dummy_vars_s = pd.get_dummies(pos_neg_reviews['subject'], prefix='sub')
X = pd.concat([pos_neg_reviews["sentiments_score"],dummy_vars_i,dummy_vars_s], axis=1)
Y = pos_neg_reviews['rating']

X = sm.add_constant(X)
model = sm.OLS(Y, X).fit()
summary = model.summary()
print(summary)

                            OLS Regression Results                            
Dep. Variable:                 rating   R-squared:                       0.289
Model:                            OLS   Adj. R-squared:                  0.289
Method:                 Least Squares   F-statistic:                     4722.
Date:                Sat, 15 Apr 2023   Prob (F-statistic):               0.00
Time:                        14:49:36   Log-Likelihood:            -1.5463e+05
No. Observations:              162990   AIC:                         3.093e+05
Df Residuals:                  162975   BIC:                         3.094e+05
Df Model:                          14                                         
Covariance Type:            nonrobust                                         
                                           coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------------------


In [55]:
dummy_vars_i = pd.get_dummies(Final_data['I_Category'], prefix='Inst')
dummy_vars_s = pd.get_dummies(Final_data['subject'], prefix='sub')
X = pd.concat([Final_data["sentiments_score"],dummy_vars_i,dummy_vars_s], axis=1)
Y = Final_data['rating']

X = sm.add_constant(X)
model = sm.OLS(Y, X).fit()
summary = model.summary()
print(summary)

                            OLS Regression Results                            
Dep. Variable:                 rating   R-squared:                       0.104
Model:                            OLS   Adj. R-squared:                  0.104
Method:                 Least Squares   F-statistic:                     3529.
Date:                Sat, 15 Apr 2023   Prob (F-statistic):               0.00
Time:                        14:48:34   Log-Likelihood:            -4.4765e+05
No. Observations:              424417   AIC:                         8.953e+05
Df Residuals:                  424402   BIC:                         8.955e+05
Df Model:                          14                                         
Covariance Type:            nonrobust                                         
                                           coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------------------
