In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from scipy import stats
from matplotlib import pyplot as plt
from scipy.stats import chi2_contingency


In [None]:
# Exhibit 1
# Read the cookie_cats CSV from the local data directory into `df`,
# then display the frame as the cell output.

df = pd.read_csv('./data/cookie_cats.csv')
df

In [None]:
# Exhibit 2
# Print the frame summary produced by DataFrame.info().

df.info()

In [None]:
# Exhibit 3
# Count the missing values in every column.

df.isna().sum()

In [None]:
# Exhibit 4
# Number of distinct values in each column.

df.nunique()

In [None]:
# Exhibit 5
# Count how many rows belong to each version and show the counts as a bar chart.

version_count = df['version'].value_counts()

sns.set_style('whitegrid')
ax = version_count.plot(kind='bar')
ax.set_title('Count version')

# Annotate every bar with the count that was actually computed, instead of
# hard-coded numbers that silently go stale when the underlying rows change
# (e.g. when a row is dropped in a later cell and the notebook is re-run).
for bar_position, bar_count in enumerate(version_count):
    ax.text(bar_position, bar_count, str(bar_count), ha='center', va='bottom')

_ = plt.xticks(rotation=0)

In [None]:
# Exhibit 6
# Inspect the overall range of sum_gamerounds.
# (Not the last expression of the cell, so its result is not displayed.)

df[['userid', 'sum_gamerounds']].sort_values(by='sum_gamerounds')

# A single value of roughly 50,000 rounds was judged meaningless, so the
# row belonging to that user id is dropped.

outlier_index = df.index[df['userid'] == 6390605]
df = df.drop(index=outlier_index)

# Scatter of rounds played against user id after removing the outlier.
scatter_ax = sns.scatterplot(data=df, x='userid', y='sum_gamerounds')
scatter_ax.set(title='Gamerounds by Scatterplot')

In [None]:
# Exhibit 6-1

# Inspect the overall range (result is not displayed: not the last expression).
gamerounds = df['sum_gamerounds']
gamerounds.sort_values()

# Histogram restricted to bins of width 3 covering 0 up to (but excluding) 250.
histogram_bins = np.arange(0, 250, 3)
gamerounds.plot(kind='hist', bins=histogram_bins)
plt.title('Gamerounds by Histogram')

In [None]:
# Exhibit 7
# Strip plot of rounds played, split by version.

strip_ax = sns.stripplot(data=df, x='version', y='sum_gamerounds', size=3, palette='pastel')
strip_ax.set(title='Gamerounds by Version')
# Limit the y axis to 0-2500.
strip_ax.set_ylim(0, 2500)

In [None]:
# Print the mean of sum_gamerounds for gate_30, then for gate_40.
for version_name in ('gate_30', 'gate_40'):
    rounds_for_version = df.loc[df['version'] == version_name, 'sum_gamerounds']
    print(rounds_for_version.mean())

In [None]:
# Side-by-side count plots: rows per version, split by each retention flag.
fig, (retention_1_ax, retention_7_ax) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: retention_1.
sns.countplot(data=df, x='version', hue='retention_1', ax=retention_1_ax).set(
    title='retention_1 by version'
)

# Right panel: retention_7.
sns.countplot(data=df, x='version', hue='retention_7', ax=retention_7_ax).set(
    title='retention_7 by version'
)



In [None]:
# Levene's test for equality of variances of sum_gamerounds between versions.
test_names = ['sum_gamerounds']

# One-column frames holding the metric for each version.
gate_30 = df.loc[df['version'] == 'gate_30', ['sum_gamerounds']]
gate_40 = df.loc[df['version'] == 'gate_40', ['sum_gamerounds']]

for test_name in test_names:
    statistic, p_value = stats.levene(gate_30[test_name], gate_40[test_name])
    # A p-value above 0.05 is reported as "equal variances"; anything else
    # is reported as "unequal variances".
    message = (
        f"{test_name} p-value: {p_value}, 등분산 가정 만족"
        if p_value > 0.05
        else f"{test_name} p-value: {p_value}, 이분산 가정 만족"
    )
    print(message)

In [None]:
# Two-sided independent-samples t-test of each metric between gate_30 and gate_40.
# NOTE(review): equal_var is fixed to True regardless of the Levene result
# computed in the previous cell — confirm this is intended.
for test_name in test_names:
    t_statistic, pvalue = stats.ttest_ind(
        a=gate_30[test_name],
        b=gate_40[test_name],
        alternative='two-sided',
        equal_var=True,
    )
    # Report the t-test's own p-value. The previous version printed `p_value`,
    # which is the Levene p-value left over from the preceding cell, and did so
    # outside the loop so only stale state was ever reported.
    print(f"p-value: {pvalue}")
    print(f"귀무 가설 기각: {pvalue < 0.05}")

In [None]:
# Keep the 9018 rows with the highest sum_gamerounds
# (presumably the top ~10% of users — TODO confirm the intended cutoff).
df_3 = df.sort_values(by="sum_gamerounds", ascending=False, ignore_index=True).head(9018)
test_names = ["sum_gamerounds"]

# Per-version mean inside the subset (not displayed: not the last expression).
df_3.groupby(["version"])[["sum_gamerounds"]].mean()

# One-column frames holding the metric for each version within the subset.
version_30 = df_3.loc[df_3["version"] == "gate_30", ["sum_gamerounds"]]
version_40 = df_3.loc[df_3["version"] == "gate_40", ["sum_gamerounds"]]

# Levene's test for equality of variances between the two versions.
for test_name in test_names:
    _, p_value_levene = stats.levene(version_30[test_name], version_40[test_name])
    levene_message = (
        f"{test_name} p-value: {p_value_levene}, 등분산 가정 만족"
        if p_value_levene > 0.05
        else f"{test_name} p-value: {p_value_levene}, 이분산 가정 만족"
    )
    print(levene_message)

In [None]:
# One-sided t-test (alternative: gate_30 mean is greater than gate_40 mean)
# on the current version_30 / version_40 subsets.
# The sum_gamerounds column is passed as a Series: passing the one-column
# DataFrames makes ttest_ind return 1-element arrays, so the printed p-value
# and the reject/not-reject flag would be arrays rather than scalars.
# NOTE(review): equal_var is fixed to True regardless of the Levene result
# computed in the previous cell — confirm this is intended.
t_statistic, p_value = stats.ttest_ind(
    a=version_30["sum_gamerounds"],
    b=version_40["sum_gamerounds"],
    alternative="greater",
    equal_var=True,
)
print(f"p-value: {p_value}")
print(f"귀무 가설 기각: {p_value < 0.05}")

In [None]:
# Contingency tables of version against each retention flag.
version_column = df['version']

retention_1 = pd.crosstab(index=version_column, columns=df['retention_1'])
retention_7 = pd.crosstab(index=version_column, columns=df['retention_7'])

In [None]:
# Display the version x retention_1 contingency table.
retention_1

In [None]:
# Display the version x retention_7 contingency table.
retention_7

In [None]:
# Chi-square test of independence on the version x retention_1 table.
# Only the statistic and the p-value are kept; the other two results are discarded.
chi2_result_retention_1 = chi2_contingency(retention_1)
chi2_statistics, pvalue, _, _ = chi2_result_retention_1

print(f'카이제곱 통계량 : {chi2_statistics}')
print(f'p value : {pvalue}')

In [None]:
# Chi-square test of independence on the version x retention_7 table.
# Only the statistic and the p-value are kept; the other two results are discarded.
chi2_result_retention_7 = chi2_contingency(retention_7)
chi2_statistics, pvalue, _, _ = chi2_result_retention_7

print(f'카이제곱 통계량 : {chi2_statistics}')
print(f'p value : {pvalue}')

In [None]:
# retention_7 values for the rows whose version is gate_30.
a = df.loc[df['version'] == 'gate_30', 'retention_7']

In [None]:
# retention_7 values for the rows whose version is gate_40.
b = df.loc[df['version'] == 'gate_40', 'retention_7']

In [None]:
# Pie charts of the retention_7 split for gate_30 (left) and gate_40 (right).
plt.figure(figsize=(12,5))
colors = ['lightskyblue', 'lightcoral']
color = colors  # alias kept so any other cell referring to `color` still works

# Wedge sizes and wedge labels must come from the same object. Previously the
# sizes came from value_counts() (ordered by frequency) while the labels came
# from unique() (ordered by first appearance), so a label could end up on the
# wrong wedge. Sorting by index also gives both pies the same wedge order, so
# each colour stands for the same retention value in both charts.
gate_30_counts = a.value_counts().sort_index()
gate_40_counts = b.value_counts().sort_index()

plt.subplot(1,2,1)
plt.pie(gate_30_counts, colors=colors, labels=gate_30_counts.index, autopct='%1.1f%%', startangle=140)
plt.title('Retention 7 Ratio for Gate 30')

plt.subplot(1,2,2)
plt.pie(gate_40_counts, colors=colors, labels=gate_40_counts.index, autopct='%1.1f%%', startangle=140)
plt.title('Retention 7 Ratio for Gate 40')

In [None]:
# Top 25% by gamerounds
# (implemented as the 22545 rows with the highest sum_gamerounds).

df_4 = df.sort_values(by="sum_gamerounds", ascending=False, ignore_index=True).head(22545)
test_names = ["sum_gamerounds"]

# Per-version mean inside the subset (not displayed: not the last expression).
df_4.groupby(["version"])[["sum_gamerounds"]].mean()

# One-column frames holding the metric for each version within the subset.
version_30 = df_4.loc[df_4["version"] == "gate_30", ["sum_gamerounds"]]
version_40 = df_4.loc[df_4["version"] == "gate_40", ["sum_gamerounds"]]

# Levene's test for equality of variances between the two versions.
for test_name in test_names:
    _, p_value_levene = stats.levene(version_30[test_name], version_40[test_name])
    levene_message = (
        f"{test_name} p-value: {p_value_levene}, 등분산 가정 만족"
        if p_value_levene > 0.05
        else f"{test_name} p-value: {p_value_levene}, 이분산 가정 만족"
    )
    print(levene_message)

In [None]:
# T-5-5
# One-sided t-test (alternative: gate_30 mean is greater than gate_40 mean)
# on the current version_30 / version_40 subsets.
# The sum_gamerounds column is passed as a Series: passing the one-column
# DataFrames makes ttest_ind return 1-element arrays, so the printed p-value
# and the reject/not-reject flag would be arrays rather than scalars.
# NOTE(review): equal_var is fixed to True regardless of the Levene result
# computed in the previous cell — confirm this is intended.

t_statistic, p_value = stats.ttest_ind(
    a=version_30["sum_gamerounds"],
    b=version_40["sum_gamerounds"],
    alternative="greater",
    equal_var=True,
)
print(f"p-value: {p_value}")
print(f"귀무 가설 기각: {p_value < 0.05}")

In [None]:
# Display the mean sum_gamerounds per version within df_4.
df_4.groupby(["version"])[["sum_gamerounds"]].mean()

The END