# Exploratory Data Analysis and Hypothesis Testing

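This notebook investigates whether two daily habits—gaming time and sleep duration—are significantly associated with the number of cigarettes smoked while gaming. Association is measured with Pearson correlation and simple linear regression (OLS), so the results describe relationships between the variables rather than establish causation.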

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
import statsmodels.api as sm

In [None]:
# Load the data
# Read the spreadsheet and normalise the two headers that contain spaces so
# they match the identifiers used by the rest of the notebook. The rename is
# chained (no `inplace=True`) so re-running this cell always rebuilds `df`
# from the file instead of mutating an existing frame.
file_path = 'completed_project_data.xlsx'
df = pd.read_excel(file_path).rename(
    columns={'Game Hours': 'GameHours', 'cigarettes smoked': 'Cigarettes'}
)

# `rename` silently ignores keys that are absent, so a header mismatch in the
# spreadsheet would otherwise only surface later as a bare KeyError in a
# plotting or modelling cell. Fail fast here with a descriptive message.
required_columns = {'GameHours', 'SleepHours', 'Cigarettes'}
missing_columns = required_columns - set(df.columns)
if missing_columns:
    raise KeyError(
        f'{file_path} is missing expected column(s): {sorted(missing_columns)}'
    )

df.head()

In [None]:
# Summary statistics for the loaded data; the quartiles are spelled out
# explicitly (they are the same values pandas uses by default).
df.describe(percentiles=[0.25, 0.5, 0.75])

In [None]:
# Correlation matrix
# Pairwise correlations between the numeric columns, annotated to two decimals.
# The colour scale is pinned to the full correlation range [-1, 1] so that the
# neutral midpoint of the diverging 'coolwarm' map corresponds to r = 0;
# without this, seaborn scales the colours to the observed min/max and a weak
# correlation can be drawn as if it were strongly negative.
sns.heatmap(
    df.corr(numeric_only=True),
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
).set_title('Correlation Heatmap')
plt.show()

In [None]:
# Histogram of Cigarettes
# Distribution of the outcome variable with a kernel density estimate overlaid;
# the title is set on the Axes object that seaborn returns.
sns.histplot(data=df, x='Cigarettes', kde=True).set_title(
    'Distribution of Cigarettes Smoked While Gaming'
)
plt.show()

In [None]:
# Scatter plot: GameHours vs Cigarettes
# Scatter of the two columns with seaborn's fitted regression line; the title
# is set on the Axes object that seaborn returns.
sns.regplot(data=df, x='GameHours', y='Cigarettes').set_title(
    'Game Hours vs Cigarettes'
)
plt.show()

In [None]:
# Scatter plot: SleepHours vs Cigarettes
# Scatter of the two columns with seaborn's fitted regression line; the title
# is set on the Axes object that seaborn returns.
sns.regplot(data=df, x='SleepHours', y='Cigarettes').set_title(
    'Sleep Hours vs Cigarettes'
)
plt.show()

## Hypothesis 1: More gaming leads to more cigarettes smoked

In [None]:
# Pearson correlation between gaming time and cigarettes smoked.
# Rows where either value is missing are dropped first: a single NaN in either
# column would otherwise make the coefficient and p-value NaN. On complete
# data this selects every row, so the result is unchanged.
# NOTE(review): the hypothesis is directional, but pearsonr reports a
# two-sided p-value by default — confirm that is the intended test.
h1_pairs = df[['GameHours', 'Cigarettes']].dropna()
corr1, p1 = pearsonr(h1_pairs['GameHours'], h1_pairs['Cigarettes'])
print(f'Hypothesis 1 - Pearson r: {corr1:.2f}, p-value: {p1:.4g}')

In [None]:
# Simple linear regression: Cigarettes ~ const + GameHours.
# `add_constant` adds the intercept column to the design matrix.
X1 = sm.add_constant(df['GameHours'])
# missing='drop' removes observations with a NaN in either the response or the
# predictor; the default ('none') performs no NaN check, so missing cells
# would propagate into the fit. On complete data the fit is unchanged.
model1 = sm.OLS(df['Cigarettes'], X1, missing='drop').fit()
print(model1.summary())

## Hypothesis 2: Sleeping less leads to more cigarettes smoked

In [None]:
# Pearson correlation between sleep duration and cigarettes smoked.
# Rows where either value is missing are dropped first: a single NaN in either
# column would otherwise make the coefficient and p-value NaN. On complete
# data this selects every row, so the result is unchanged.
# NOTE(review): the hypothesis is directional (less sleep -> more cigarettes,
# i.e. a negative r), but pearsonr reports a two-sided p-value by default —
# confirm that is the intended test.
h2_pairs = df[['SleepHours', 'Cigarettes']].dropna()
corr2, p2 = pearsonr(h2_pairs['SleepHours'], h2_pairs['Cigarettes'])
print(f'Hypothesis 2 - Pearson r: {corr2:.2f}, p-value: {p2:.4g}')

In [None]:
# Simple linear regression: Cigarettes ~ const + SleepHours.
# `add_constant` adds the intercept column to the design matrix.
X2 = sm.add_constant(df['SleepHours'])
# missing='drop' removes observations with a NaN in either the response or the
# predictor; the default ('none') performs no NaN check, so missing cells
# would propagate into the fit. On complete data the fit is unchanged.
model2 = sm.OLS(df['Cigarettes'], X2, missing='drop').fit()
print(model2.summary())