In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the classroom data, parsing numbers with '.' as the decimal separator.
df1 = pd.read_csv(
    "online_classroom_data.csv",
    decimal=".",
)

In [None]:
# Read the same file a second time, now treating ',' as the decimal separator.
df2 = pd.read_csv(
    "online_classroom_data.csv",
    decimal=",",
)

# The dataset contains student skill evaluations and the number of reactions their posts received on an e-learning platform. The main question is whether the reactions are linked to the students' skill levels.

In [None]:
# Replace df1's skill columns with the versions from df2 (the read that used
# decimal=','), stored under "Characteristic_<i>" names, and collect the mean
# of each new column in `a` (a[0] belongs to Characteristic_1, and so on).
# NOTE(review): presumably the sk<i>_classroom values use ',' as the decimal
# separator in the CSV, which is why the df2 copy is preferred - confirm.
a = []
for i in range(1, 6):
    skill_source = f"sk{i}_classroom"
    skill_target = f"Characteristic_{i}"
    df1[skill_target] = df2[skill_source]
    # errors="ignore" keeps the cell re-runnable: on a second pass the source
    # column is already gone from df1 and a strict drop would raise KeyError.
    df1 = df1.drop(columns=[skill_source], errors="ignore")
    a.append(df1[skill_target].mean())
    


In [None]:
# Display the list of mean values, one per Characteristic_<i> column.
a

In [None]:
# Preview the first rows of df1.
df1.head()

In [None]:
# Summary statistics of df1's columns.
df1.describe()

In [None]:
# Count the missing values in each column of df1.
df1.isna().sum()

In [None]:
# Show the dtype of each column of df1.
df1.dtypes

# Below is a bar plot of the verification status of all 71 students on the e-learning platform

In [None]:
# Split the students into approved / not approved using the "Approved" column:
# its sum is the approved count, and the remainder of its non-null count is the rest.
approved_flags = df1["Approved"]
category_approved = approved_flags.sum()
category_not_approved = approved_flags.count() - approved_flags.sum()

# Draw one bar per status and write the student count in the middle of each bar.
status_labels = ["Approved in Platform", "Not approved in Platform"]
status_counts = [category_approved, category_not_approved]
plt.bar(status_labels, status_counts, color=["purple", "blue"])
plt.xlabel("Categories")
plt.ylabel("Count")
plt.title("Barplot of students verification status")
for bar_position, bar_count in enumerate(status_counts):
    plt.text(
        bar_position,
        bar_count / 2,
        f"{bar_count} students",
        ha="center",
        va="center",
        color="white",
    )
plt.show()

# Total posts for different buckets of time spent online

In [None]:
# Bar plot of total_posts (y axis) for each timeonline value (x axis).
ax = sns.barplot(
    x=df1["timeonline"],
    y=df1["total_posts"],
)


# Below we plot a histogram for each skill rating, with its mean value displayed as a vertical dashed line

In [None]:
# Plot titles for the five skill columns; desc[0] belongs to Characteristic_1, etc.
desc = [
    "Critical Thinking and Problem Solving Skills  of a student",
    "Criativity and Inovation Skills of a student",
    "Constant and Self Learning Skills",
    "Collaboration and Self-Direction Skills  of a student",
    "Social and Cultural Responsability of a student",
]
for i, skill_title in enumerate(desc, start=1):
    # Mean of this skill, taken from the list `a` filled when the columns were created.
    skill_mean = a[i - 1]
    charact = df1.filter(regex=f'^Characteristic_{i}')

    # Histogram of the selected column, with the mean marked by a dashed line
    # and a rotated label placed just left of it.
    charact.hist(figsize=(10, 6), bins=10)
    plt.axvline(skill_mean, color='purple', linestyle='dashed', linewidth=2)
    plt.text(skill_mean - 0.3, 12, 'Mean value', rotation=90)
    plt.title(skill_title)
    plt.show()

# We may also plot the histograms with their density approximation

In [None]:
# One histogram with a kernel density overlay per skill column.
for i in range(1, 6):
    skill_values = df1[f"Characteristic_{i}"]
    sns.displot(skill_values, kde=True)
    plt.show()

# Boxplots of the number of reactions of each type that students received for their posts

In [None]:
# Columns holding the post and reaction counts to compare.
reactions = [
    "total_posts",
    "helpful_post",
    "nice_code_post",
    "collaborative_post",
    "confused_post",
    "creative_post",
    "bad_post",
    "amazing_post",
]
# NOTE: from here on df2 is the reactions-only subset of df1, no longer the
# second read of the CSV.
df2 = df1[reactions]

# Side-by-side boxplots, one per reaction column.
fig, ax = plt.subplots(figsize=(14, 6))
df2.boxplot(ax=ax)
ax.set(
    title="Boxplots of reactions students received for their posts",
    ylabel="Number of reactions",
)
plt.show()

In [None]:
# Per-column first and third quartiles of df2 and the interquartile range between them.
Q1, Q3 = np.quantile(df2, [0.25, 0.75], axis=0)
IQR = Q3 - Q1

In [None]:
# Display the interquartile range (Q3 - Q1) computed along axis 0 of df2.
IQR

# Below we try to identify students whose reaction counts lie more than 2 standard deviations from the mean

In [None]:
# How many standard deviations from the mean a value may lie before it is flagged.
outlier_threshold = 2

# Column-wise mean and standard deviation of df2.
mean, std = df2.mean(), df2.std()

# A row is flagged when at least one of its columns deviates from that
# column's mean by more than outlier_threshold standard deviations.
outliers = (
    df2.sub(mean)
    .abs()
    .gt(std.mul(outlier_threshold))
    .any(axis=1)
)
# Index labels of the flagged rows.
outlier_indexes = df2.index[outliers]


In [None]:
# Number of rows flagged in the boolean outlier mask.
outliers.sum()

In [None]:
# Per-column deviation cutoff used for flagging (threshold times the standard deviation).
outlier_threshold *std

In [None]:
# Keep only the rows of df2 selected by the boolean outlier mask, then display them.
out = df2.loc[outliers]
out

In [None]:
# Print the index labels of the flagged rows.
print(out.index)

In [None]:
# Pull the complete df1 rows (all columns) for the flagged students.
# out is a row subset of df2 = df1[reactions], so out.index holds df1's own
# index labels; passing them straight to .loc selects the right rows for any
# index type, without treating the labels as positions.
df_outliers = df1.loc[out.index]

In [None]:
# Display the df1 rows selected for the flagged students.
df_outliers

# We may create correlation heatmaps for both the initial data and the data of the students who had out-of-range reactions on their posts

In [None]:
# Annotated heatmap of the pairwise column correlations of the full df1.
correlations_all = df1.corr()
plt.figure(figsize=(15, 10))
ax = sns.heatmap(correlations_all, cmap="crest", annot=True)
ax.set_title("Correlation heatmap for all 71 students")
plt.show()

In [None]:
# Annotated heatmap of the pairwise column correlations, restricted to the
# rows in df_outliers (students flagged by the 2*SD rule).
plt.figure(figsize=(15, 10))
ax1 = sns.heatmap(df_outliers.corr(), annot=True, cmap="crest")
# Give set_title the string itself. set_title returns a Text object, so
# nesting one set_title call inside another would show that object's repr as
# this plot's title and would also retitle the other axes.
ax1.set_title("Correlation heatmap for 10 students showing high deviance")
plt.show()

# We may track the high-correlation pairs (> 0.7 or < -0.7):
# [Critical Thinking and Problem Solving Skills of a student and Creative_post],
# [Creativity and Innovation Skills of a student and Collaborative_post, Creative_post, and Amazing_post],
# [Constant and Self Learning Skills and Collaborative_post, Creative_post],
# [Collaboration and Self-Direction Skills and timeonline]