In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the raw anime dataset exactly as stored on disk, before any cleaning.
original = pd.read_csv('animes.csv')

# Summarise the raw data (columns, dtypes, non-null counts).
print("Basic summary statastics : \n")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line to the output.
original.info()

In [None]:
# Columns retained for the cleaned dataset used by the tests below.
columns_to_keep = ['title', 'genre', 'aired', 'episodes', 'members', 'popularity', 'ranked', 'score']
cleaned_df = original[columns_to_keep]

# Drop every row that has a missing value in any retained column.
cleaned_df = cleaned_df.dropna()

# Keep only rows whose 'genre' cell evaluates to something of non-zero length.
# NOTE(review): eval() executes whatever Python expression the CSV cell holds.
# That is only acceptable for a trusted file; for data of unknown origin,
# switch to ast.literal_eval.
cleaned_df = cleaned_df[cleaned_df['genre'].apply(lambda x: len(eval(x)) > 0)]

# Drop exact duplicate rows, if any.
cleaned_df = cleaned_df.drop_duplicates()

# Number of rows to draw for the working sample.
num_samples = 300

# Fixed seed so that Restart & Run All reproduces the same sample, and hence
# the same test statistics and conclusions in every cell below.
random_seed = 42

# Simple random sampling without replacement.
sampled_df = cleaned_df.sample(n=num_samples, random_state=random_seed)

# Persist the cleaned sample; later cells read it back from this file.
sampled_df.to_csv('cleaned_dataset.csv', index=False)

In [None]:
# Read back the cleaned sample written by the cleaning cell.
cleaned = pd.read_csv('cleaned_dataset.csv')

# Summarise the cleaned sample (columns, dtypes, non-null counts).
print("Basic summary statastics : \n")
# DataFrame.info() prints its own report and returns None; calling it bare
# avoids the extra "None" line that print(cleaned.info()) would produce.
cleaned.info()

In [None]:
# Significance level (5%) shared by the hypothesis tests that follow.
alpha = 0.05

In [None]:
# One Sample Z-Test:

In [None]:
# Hypothesised population mean score that the claim is tested against.
u = 6.5

# State the test, the claim and both hypotheses, one message per line.
print(
    "One sample z-test: ",
    "The average score is more than 6.5. (claim)",
    "H0: u <= 6.5\n",
    "H1: u > 6.5 (claim)",
    sep="\n",
)

In [None]:
# The z-test works on the first `sample_size` scores of the cleaned data.
sample_size = 50
sample_data = cleaned['score'].iloc[:sample_size]

# Series.std() uses ddof=1 by default, i.e. the sample standard deviation.
sample_mean, sample_std = sample_data.mean(), sample_data.std()
print("The sample has mean ",sample_mean, "and standard deviation ", sample_std, ".")

In [None]:
# z = (sample mean - hypothesised mean) / (s / sqrt(n))
standard_error = sample_std / np.sqrt(sample_size)
z_cal = (sample_mean - u) / standard_error

print("Z calculated is ", z_cal)

In [None]:
# Table value for a one-tailed z-test at the 0.05 level, with its mirror image.
z_tab_positive = 1.645
z_tab_negative = -z_tab_positive
print("At significance level of 0.05, z tabulated for one tailed test is 1.645 .")

In [None]:
# Right-tailed test (H1: u > 6.5): H0 is rejected only when the statistic falls
# in the upper tail. A two-sided interval check would also "reject" for a large
# negative z and then wrongly report that the mean is above 6.5.
if z_cal > z_tab_positive:
    print("Reject H0. The mean score is more than 6.5.")
else:
    print("Accept H0. The mean score is less than 6.5.")

In [None]:
# Two Sample Z-Test:

In [None]:
# Announce the two-sample z-test and its null/alternative hypotheses.
print(
    "Two sample z-test: \n",
    "H0: u1 = u2\n",
    "H1: u1 != u2",
    sep="\n",
)

In [None]:
# Work on the popularity/score columns of the first 100 rows.
sample_data = cleaned[['popularity', 'score']].head(100)

# Split the scores on the popularity value. Both comparisons are strict, so a
# row with popularity exactly 10000 lands in neither set.
setA = sample_data[sample_data['popularity'] > 10000]['score'].tolist()
setB = sample_data[sample_data['popularity'] < 10000]['score'].tolist()
n1, n2 = len(setA), len(setB)

print("Set A with popularity > 10000 : ",setA)
print("\nn1: ",n1)
print("\nSet B with popularity < 10000 : ",setB)
print("\nn2: ",n2)

In [None]:
# Hypothesised population means entering the z statistic.
# NOTE(review): the stated H0 is u1 = u2 (difference 0), yet these values give
# u1 - u2 = -2 — confirm which one is intended.
u1, u2 = 5, 7
print("u1 = 5 and u2 = 7")

In [None]:
# Standard deviation of each set; np.std defaults to ddof=0 (divides by n).
sigma1, sigma2 = np.std(setA), np.std(setB)

print("Sigma1: ",sigma1," Sigma2: ",sigma2)

In [None]:
# Standard error of the difference of means: sqrt(sigma1^2/n1 + sigma2^2/n2).
variance_term_a = sigma1 ** 2 / n1
variance_term_b = sigma2 ** 2 / n2
SDE = np.sqrt(variance_term_a + variance_term_b)
print(SDE)

In [None]:
# Sample mean score of each set.
x1, x2 = np.mean(setA), np.mean(setB)

print("X1: ", x1, " X2: ", x2)

In [None]:
# z = ((x1 - x2) - (u1 - u2)) / SDE
# The whole numerator must be parenthesised: without the outer parentheses only
# (u1 - u2) is divided by SDE, because "/" binds tighter than "-".
z_cal = ((x1 - x2) - (u1 - u2)) / SDE

print("z calculated is ",z_cal)

In [None]:
# Table value for a two-tailed z-test at the 0.05 level, with its mirror image.
z_tab_positive = 1.960
z_tab_negative = -z_tab_positive

print("z tab for two tailed test with significance level 0.05 is", z_tab_positive)

In [None]:
# Two-tailed decision: H0 is kept only while z lies strictly between the two
# critical values.
h0_retained = z_tab_negative < z_cal < z_tab_positive
print("Accept H0. u1 = u2." if h0_retained else "Reject H0. u1 != u2.")

In [None]:
# One sample t-test:

In [None]:
print("One sample t-test: ")

print("The average score is 5. (claim)")
# Hypothesised population mean score.
u = 5

# The claim is the equality "average score is 5", so the "(claim)" tag belongs
# on H0, not on H1.
print("H0: u = 5 (claim)\n")
print("H1: u != 5")

In [None]:
# The t-test works on the first `sample_size` scores of the cleaned data.
sample_size = 25
sample_data = cleaned['score'].iloc[:sample_size]

# Series.std() uses ddof=1 by default, i.e. the sample standard deviation.
sample_mean, sample_std = sample_data.mean(), sample_data.std()
print("The sample has mean",sample_mean, "and standard deviation", sample_std, ".")

In [None]:
# t = (sample mean - hypothesised mean) / (s / sqrt(n))
t_cal = (sample_mean - u)/(sample_std/np.sqrt(sample_size))

# Report the statistic just computed; z_cal is a leftover from an earlier test.
print("t calculated is", t_cal)

In [None]:
# Table value for a two-tailed t-test at the 0.05 level with df = 24, plus its
# mirror image.
t_tab_positive = 2.0639
t_tab_negative = -t_tab_positive
print("At significance level of 0.05, and df = 25 - 1 = 24, t tabulated for two tailed test is 2.0639.")

In [None]:
# Two-tailed decision: H0 is kept only while t lies strictly between the two
# critical values.
h0_retained = t_tab_negative < t_cal < t_tab_positive
print("Accept H0. The average score is 5." if h0_retained else "Reject H0. The average score is not 5.")

In [None]:
# Two sample t-test:

In [None]:
# Announce the two-sample t-test and its null/alternative hypotheses.
print(
    "Two sample t-test: \n",
    "H0: u1 = u2\n",
    "H1: u1 != u2",
    sep="\n",
)

In [None]:
# Work on the popularity/score columns of the first 40 rows.
sample_data = cleaned[['popularity', 'score']].head(40)

# Split the scores on the popularity value. Both comparisons are strict, so a
# row with popularity exactly 10000 lands in neither set.
setA = sample_data[sample_data['popularity'] > 10000]['score'].tolist()
setB = sample_data[sample_data['popularity'] < 10000]['score'].tolist()
n1, n2 = len(setA), len(setB)

print("Set A with popularity > 10000 : ",setA)
print("\nn1: ",n1)
print("\nSet B with popularity < 10000 : ",setB)
print("\nn2: ",n2)

In [None]:
# Sample standard deviations of the two sets.
# np.std defaults to ddof=0 (divide by n). The pooled-variance formula used
# next weights each variance by (n - 1), which requires the unbiased sample
# variance, hence ddof=1.
s1 = np.std(setA, ddof=1)
s2 = np.std(setB, ddof=1)

print("S1: ",s1," S2: ",s2)

In [None]:
# Pooled standard deviation:
# sqrt(((n1-1)*s1^2 + (n2-1)*s2^2) / ((n1-1) + (n2-1)))
weighted_squares = (n1 - 1) * s1 ** 2 + (n2 - 1) * s2 ** 2
pooled_dof = (n1 - 1) + (n2 - 1)
sp = np.sqrt(weighted_squares / pooled_dof)
print(sp)

In [None]:
# Sample mean score of each set.
x1, x2 = np.mean(setA), np.mean(setB)

print("X1: ", x1, " X2: ", x2)

In [None]:
# t = (x1 - x2) / (sp * sqrt(1/n1 + 1/n2))
size_factor = np.sqrt((1/n1)+(1/n2))
t_cal = (x1 - x2) / (sp * size_factor)
print("T calculated: ",t_cal)

In [None]:
# Degrees of freedom of the pooled test, taken from the actual group sizes
# rather than from the sizes observed in one particular run.
dof = (n1 - 1) + (n2 - 1)

# H1 is u1 != u2, so this is a two-tailed test; 2.024 is the two-tailed table
# value at the 0.05 level for df = 38. If the df printed here differs from 38,
# the table value must be looked up again.
print(f"At significance level of 0.05, and df = ({n1} - 1) + ({n2} - 1) = {dof}, t tabulated for two tailed test is 2.024 .")
t_tab_positive = 2.024
t_tab_negative = -2.024

In [None]:
# Two-tailed decision: H0 is kept only while t lies strictly between the two
# critical values.
h0_retained = t_tab_negative < t_cal < t_tab_positive
print("Accept H0." if h0_retained else "Reject H0.")

In [None]:
# One sample z-test for proportions:

In [None]:
# Hypothesised proportion and its complement.
p = 0.4
q = 1 - p

# State the test, the claim and both hypotheses.
print(
    "One sample z-test for proportions:\n",
    "40 percent of the animes have more than 1 episodes. (claim)\n",
    "H0: p = 0.4 (claim)\n",
    "H1: p != 0.4",
    sep="\n",
)

In [None]:
# Check that both n*p and n*(1-p) exceed 5 before running the z-test.
sample_size = 100
Np, Nq = sample_size * p, sample_size * (1-p)

conditions_met = all(value > 5 for value in (Np, Nq))
print("Np and Nq verified." if conditions_met else "Np and Nq condition not satisfied.")

In [None]:
# Episode counts of the first 100 rows.
sample_data = cleaned['episodes'].head(100)

# Number of rows whose episode count is strictly greater than 1.
count = sample_data.gt(1).sum()

# Observed sample proportion.
p_cap = count / sample_data.size

print("Proportion of animes with more than 1 episodes:", p_cap)


In [None]:
# z = (p_cap - p) / sqrt(p*q / n)
null_std_error = np.sqrt((p*q)/sample_size)
z_cal = (p_cap - p) / null_std_error
print("z calculated:",z_cal)

In [None]:
# Table value for a two-tailed z-test at the 0.05 level, with its mirror image.
z_tab_positive = 1.96
z_tab_negative = -z_tab_positive
print("At significance level of 0.05, z tabulated for two tailed test is 1.96.")

In [None]:
# Two-tailed decision: H0 is kept only while z lies strictly between the two
# critical values.
h0_retained = z_tab_negative < z_cal < z_tab_positive
print(
    "Accept H0. 40 percent of the animes have more than 1 episodes.\n"
    if h0_retained
    else "Reject H0. 40 percent of the animes do not have more than 1 episodes."
)

In [None]:
# Two sample z-test for proportions:

In [None]:
# State the test, the claim and both hypotheses.
print(
    "Two sample z-test for proportions:\n",
    "There are more animes with episodes more than 5.\n",
    "H0: p1 >= p2\n",
    "H1: p1 < p2",
    sep="\n",
)

In [None]:
# Episode counts of every row in the cleaned sample.
sample_data = cleaned['episodes']
n = len(sample_data)

# Rows with more than 5 episodes, and rows with at most 5 episodes.
count1 = sample_data.gt(5).sum()
count2 = sample_data.le(5).sum()

# Pooled proportion over both groups (each of size n) and its complement.
p = (count1 + count2) / (n + n)
q = 1 - p

print("p =", p," and q =", q)


In [None]:
# Observed proportion of each group relative to the sample size n.
p1_cap, p2_cap = count1 / n, count2 / n
print("p1_cap =",p1_cap, "and p2_cap =", p2_cap)

In [None]:
# z = ((p1_cap - p2_cap) - 0) / sqrt(p*q*(1/n + 1/n)); the hypothesised
# difference under H0 is 0.
observed_diff = (p1_cap - p2_cap) - 0
pooled_std_error = np.sqrt(p*q*((1/n)+(1/n)))
z_cal = observed_diff / pooled_std_error
print("z calculated:",z_cal)

In [None]:
# One-tailed critical value (1.645) and its mirror image.
z_tab_positive = 1.645
z_tab_negative = -z_tab_positive
print("z tabulated:",z_tab_positive)

In [None]:
# Left-tailed test (H1: p1 < p2): H0 is rejected only when the statistic falls
# in the lower tail. A two-sided interval check would also reject H0 for a
# large positive z, which is evidence in favour of H0 (p1 >= p2).
if z_cal < z_tab_negative:
    print("Reject H0.")
else:
    print("Accept H0.")