In [14]:
# Import Packages
import matplotlib.pyplot as plt
import statsmodels.api as sm
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [17]:
# Read the merged course/review table and discard every row whose
# "clean_reviews" value is missing.
final_data = (
    pd.read_csv('Coursera_Final_Merge_clean_Data.csv')
    .dropna(subset=["clean_reviews"])
)

In [32]:
# OLS of rating on sentiment score + institution category + subject,
# fitted on every row of final_data.
#
# drop_first=True omits one reference level per categorical factor. The model
# also gets an intercept (add_constant below); keeping every level of both
# factors would make the dummy columns of each factor sum to the constant
# column, leaving the design matrix rank-deficient (dummy-variable trap) and
# the individual coefficients unidentified. Fitted values and R-squared are
# unaffected; each remaining coefficient is now a contrast against the
# dropped baseline level.
#
# dtype=float keeps the dummies numeric: pandas >= 2.0 defaults to bool
# dummies, which can leave the concatenated frame with mixed dtypes that
# statsmodels refuses to cast.
dummy_vars_i = pd.get_dummies(final_data['I_Category'], prefix='Inst', drop_first=True, dtype=float)
dummy_vars_s = pd.get_dummies(final_data['subject'], prefix='sub', drop_first=True, dtype=float)
X = pd.concat([final_data["sentiments_score"], dummy_vars_i, dummy_vars_s], axis=1)
Y = final_data['rating']

# Add the intercept column, fit, and show the regression table.
X = sm.add_constant(X)
model = sm.OLS(Y, X).fit()
summary = model.summary()
print(summary)

                            OLS Regression Results                            
Dep. Variable:                 rating   R-squared:                       0.104
Model:                            OLS   Adj. R-squared:                  0.104
Method:                 Least Squares   F-statistic:                     3514.
Date:                Sat, 22 Apr 2023   Prob (F-statistic):               0.00
Time:                        17:28:27   Log-Likelihood:            -4.4768e+05
No. Observations:              424380   AIC:                         8.954e+05
Df Residuals:                  424365   BIC:                         8.956e+05
Df Model:                          14                                         
Covariance Type:            nonrobust                                         
                                           coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------------------


In [35]:
# OLS of rating on sentiment score + individual institution + subject,
# fitted on every row of final_data.
#
# drop_first=True omits one reference level per categorical factor. The model
# also gets an intercept (add_constant below); keeping every level of both
# factors would make the dummy columns of each factor sum to the constant
# column, leaving the design matrix rank-deficient (dummy-variable trap) and
# the individual coefficients unidentified. Fitted values and R-squared are
# unaffected; each remaining coefficient is now a contrast against the
# dropped baseline level.
#
# dtype=float keeps the dummies numeric: pandas >= 2.0 defaults to bool
# dummies, which can leave the concatenated frame with mixed dtypes that
# statsmodels refuses to cast.
dummy_vars_i = pd.get_dummies(final_data['institution'], prefix='', drop_first=True, dtype=float)
dummy_vars_s = pd.get_dummies(final_data['subject'], prefix='sub', drop_first=True, dtype=float)
X = pd.concat([final_data["sentiments_score"], dummy_vars_i, dummy_vars_s], axis=1)
Y = final_data['rating']

# Add the intercept column, fit, and show the regression table.
X = sm.add_constant(X)
model = sm.OLS(Y, X).fit()
summary = model.summary()
print(summary)

                            OLS Regression Results                            
Dep. Variable:                 rating   R-squared:                       0.126
Model:                            OLS   Adj. R-squared:                  0.125
Method:                 Least Squares   F-statistic:                     426.2
Date:                Sat, 22 Apr 2023   Prob (F-statistic):               0.00
Time:                        23:40:16   Log-Likelihood:            -4.4248e+05
No. Observations:              424380   AIC:                         8.852e+05
Df Residuals:                  424236   BIC:                         8.868e+05
Df Model:                         143                                         
Covariance Type:            nonrobust                                         
                                                               coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------

#### For Positive and Negative Reviews Only (Neutral Reviews Excluded)

In [38]:
# Keep only the rows whose "sentiments" label is not "Neutral".
pos_neg_reviews = final_data.loc[final_data["sentiments"].ne("Neutral")]

In [39]:
# OLS of rating on sentiment score + institution category + subject,
# fitted on pos_neg_reviews only.
#
# drop_first=True omits one reference level per categorical factor. The model
# also gets an intercept (add_constant below); keeping every level of both
# factors would make the dummy columns of each factor sum to the constant
# column, leaving the design matrix rank-deficient (dummy-variable trap) and
# the individual coefficients unidentified. Fitted values and R-squared are
# unaffected; each remaining coefficient is now a contrast against the
# dropped baseline level.
#
# dtype=float keeps the dummies numeric: pandas >= 2.0 defaults to bool
# dummies, which can leave the concatenated frame with mixed dtypes that
# statsmodels refuses to cast.
dummy_vars_i = pd.get_dummies(pos_neg_reviews['I_Category'], prefix='Inst', drop_first=True, dtype=float)
dummy_vars_s = pd.get_dummies(pos_neg_reviews['subject'], prefix='sub', drop_first=True, dtype=float)
X = pd.concat([pos_neg_reviews["sentiments_score"], dummy_vars_i, dummy_vars_s], axis=1)
Y = pos_neg_reviews['rating']

# Add the intercept column, fit, and show the regression table.
X = sm.add_constant(X)
model = sm.OLS(Y, X).fit()
summary = model.summary()
print(summary)

                            OLS Regression Results                            
Dep. Variable:                 rating   R-squared:                       0.285
Model:                            OLS   Adj. R-squared:                  0.285
Method:                 Least Squares   F-statistic:                     4605.
Date:                Sun, 23 Apr 2023   Prob (F-statistic):               0.00
Time:                        19:13:25   Log-Likelihood:            -1.5796e+05
No. Observations:              161417   AIC:                         3.159e+05
Df Residuals:                  161402   BIC:                         3.161e+05
Df Model:                          14                                         
Covariance Type:            nonrobust                                         
                                           coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------------------


In [33]:
# OLS of rating on sentiment score + individual institution + subject,
# fitted on pos_neg_reviews only. The `model` and `Y` left behind by this
# cell are what the metrics cell that follows evaluates.
#
# drop_first=True omits one reference level per categorical factor. The model
# also gets an intercept (add_constant below); keeping every level of both
# factors would make the dummy columns of each factor sum to the constant
# column, leaving the design matrix rank-deficient (dummy-variable trap) and
# the individual coefficients unidentified. Fitted values and R-squared are
# unaffected; each remaining coefficient is now a contrast against the
# dropped baseline level.
#
# dtype=float keeps the dummies numeric: pandas >= 2.0 defaults to bool
# dummies, which can leave the concatenated frame with mixed dtypes that
# statsmodels refuses to cast.
dummy_vars_i = pd.get_dummies(pos_neg_reviews['institution'], prefix='', drop_first=True, dtype=float)
dummy_vars_s = pd.get_dummies(pos_neg_reviews['subject'], prefix='sub', drop_first=True, dtype=float)
X = pd.concat([pos_neg_reviews["sentiments_score"], dummy_vars_i, dummy_vars_s], axis=1)
Y = pos_neg_reviews['rating']

# Add the intercept column, fit, and show the regression table.
X = sm.add_constant(X)
model = sm.OLS(Y, X).fit()
summary = model.summary()
print(summary)

                            OLS Regression Results                            
Dep. Variable:                 rating   R-squared:                       0.299
Model:                            OLS   Adj. R-squared:                  0.298
Method:                 Least Squares   F-statistic:                     480.4
Date:                Sat, 22 Apr 2023   Prob (F-statistic):               0.00
Time:                        23:37:55   Log-Likelihood:            -1.5644e+05
No. Observations:              161417   AIC:                         3.132e+05
Df Residuals:                  161273   BIC:                         3.146e+05
Df Model:                         143                                         
Covariance Type:            nonrobust                                         
                                                               coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------

In [34]:
# In-sample error metrics for whichever `model` / `Y` pair the most recently
# executed regression cell left in the namespace. predict() is called without
# arguments, so the predictions are for the data the model was fitted on.
predictions = model.predict()

# Squared-error metrics (RMSE is derived from MSE), absolute error, and R^2.
mse = mean_squared_error(Y, predictions)
rmse = np.sqrt(mse)
mae = mean_absolute_error(Y, predictions)
r_squared = r2_score(Y, predictions)

# Report every metric on its own line as "<label> <value>".
for label, value in (
    ("Mean Squared Error (MSE):", mse),
    ("Root Mean Squared Error (RMSE):", rmse),
    ("Mean Absolute Error (MAE):", mae),
    ("R-squared:", r_squared),
):
    print(label, value)


Mean Squared Error (MSE): 0.40676419006018383
Root Mean Squared Error (RMSE): 0.6377806755148543
Mean Absolute Error (MAE): 0.4212565188868819
R-squared: 0.298721103156379


#### For University Data only

In [36]:
# Keep the rows whose I_Category is "University" and whose World_Rank is
# not missing.
u_pos_neg_reviews = pos_neg_reviews.loc[
    pos_neg_reviews["I_Category"].eq("University")
    & pos_neg_reviews["World_Rank"].notna()
]

In [37]:
# OLS of rating on sentiment score + individual institution + subject,
# fitted on u_pos_neg_reviews only.
#
# drop_first=True omits one reference level per categorical factor. The model
# also gets an intercept (add_constant below); keeping every level of both
# factors would make the dummy columns of each factor sum to the constant
# column, leaving the design matrix rank-deficient (dummy-variable trap) and
# the individual coefficients unidentified. Fitted values and R-squared are
# unaffected; each remaining coefficient is now a contrast against the
# dropped baseline level.
#
# dtype=float keeps the dummies numeric: pandas >= 2.0 defaults to bool
# dummies, which can leave the concatenated frame with mixed dtypes that
# statsmodels refuses to cast.
dummy_vars_i = pd.get_dummies(u_pos_neg_reviews['institution'], prefix='U', drop_first=True, dtype=float)
dummy_vars_s = pd.get_dummies(u_pos_neg_reviews['subject'], prefix='sub', drop_first=True, dtype=float)
X = pd.concat([u_pos_neg_reviews["sentiments_score"], dummy_vars_i, dummy_vars_s], axis=1)
Y = u_pos_neg_reviews['rating']

# Add the intercept column, fit, and show the regression table.
X = sm.add_constant(X)
model = sm.OLS(Y, X).fit()
summary = model.summary()
print(summary)

                            OLS Regression Results                            
Dep. Variable:                 rating   R-squared:                       0.281
Model:                            OLS   Adj. R-squared:                  0.280
Method:                 Least Squares   F-statistic:                     437.4
Date:                Sun, 23 Apr 2023   Prob (F-statistic):               0.00
Time:                        19:12:10   Log-Likelihood:            -1.0853e+05
No. Observations:              114446   AIC:                         2.173e+05
Df Residuals:                  114343   BIC:                         2.183e+05
Df Model:                         102                                         
Covariance Type:            nonrobust                                         
                                                                coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------