In [1]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset
file_path = "addiction_population_data.csv"
df = pd.read_csv(file_path)

# --- 1. Initial Data Inspection (Recap) ---
print("--- Initial Data Inspection Summary ---")
print(f"Shape of the dataframe: {df.shape}")
print("Data types and non-null counts:")
# df.info() # Already printed in detail, concise summary here.
print(df.dtypes.value_counts())
print(f"Missing values total: {df.isnull().sum().sum()}")
print(f"Duplicate rows: {df.duplicated().sum()}")

# --- 2. Clean Data (Confirmation, as data seems clean) ---
print("\n\n--- Data Cleaning ---")

print("Data is already clean (no missing values, no duplicates found in initial inspection).")
print("Proceeding with analysis using available columns.")

# --- 3. Prove Data Cleanliness and Size ---
print("\n\n--- Proof of Data Cleanliness and Size ---")
print("Shape of the cleaned dataframe (rows, columns):")
print(df.shape)
print("\nInformation about the cleaned dataframe (dtypes, non-null counts):")
df.info() # Full info for proof
print("\nMissing values per column after final check:")
print(df.isnull().sum())

if df.shape[0] >= 100:
    print(f"\nThe dataset has {df.shape[0]} data points, which is >= 100.")
else:
    print(f"\nThe dataset has {df.shape[0]} data points, which is < 100.")

# --- 4. NumPy Operations ---
print("\n\n--- NumPy Operations ---")
# Using actual available numeric columns
arr_age = df['age'].values
arr_income = df['annual_income_usd'].values
arr_smokes_per_day = df['smokes_per_day'].values
arr_bmi = df['bmi'].values
arr_drinks_per_week = df['drinks_per_week'].values

# Op 1: Calculate the mean age
mean_age = np.mean(arr_age)
print(f"1. Mean Age: {mean_age:.2f} years")

# Op 2: Calculate the median annual income
median_income = np.median(arr_income)
print(f"2. Median Annual Income (USD): {median_income:.2f}")

# Op 3: Calculate the total number of smokes per day recorded across all entries
total_smokes = np.sum(arr_smokes_per_day)
print(f"3. Total Smokes Per Day (sum across all individuals): {total_smokes}")

# Op 4: Calculate the standard deviation of BMI
std_dev_bmi = np.std(arr_bmi)
print(f"4. Standard Deviation of BMI: {std_dev_bmi:.2f}")

# Op 5: Calculate the 75th percentile of drinks per week
percentile_75_drinks = np.percentile(arr_drinks_per_week, 75)
print(f"5. 75th Percentile of Drinks Per Week: {percentile_75_drinks:.2f}")

# --- 5. SciPy Operation ---
print("\n\n--- SciPy Operation ---")
if df['smokes_per_day'].nunique() > 1 and df['attempts_to_quit_smoking'].nunique() > 1:
    correlation_smoke_quit, p_value_smoke_quit = stats.pearsonr(df['smokes_per_day'], df['attempts_to_quit_smoking'])
    print(f"1. Pearson correlation between 'smokes_per_day' and 'attempts_to_quit_smoking': {correlation_smoke_quit:.4f} (p-value: {p_value_smoke_quit:.4g})")
else:
    print("Could not calculate Pearson correlation due to insufficient variance in data.")

# --- 6. Statsmodels Operation (OLS Regression) ---
print("\n\n--- Statsmodels Operation (OLS Regression) ---")
Y = df['bmi']
X = df[['smokes_per_day', 'sleep_hours']]
X = sm.add_constant(X)

model_bmi = sm.OLS(Y, X)
results_bmi = model_bmi.fit()
print("OLS Regression: Predicting BMI based on Smokes per Day and Sleep Hours")
print(results_bmi.summary())

# --- 7. Visualizations ---
print("\n\n--- Visualizations ---")
plt.style.use('seaborn-v0_8-whitegrid')

# Visualization 1: Histogram of 'age'
plt.figure(figsize=(10, 6))
sns.histplot(df['age'], kde=True, bins=20)
plt.title('Distribution of Age')
plt.xlabel('Age (years)')
plt.ylabel('Frequency')
plt.tight_layout()
plt.savefig("age_distribution.png")
print("Saved age_distribution.png")

# Visualization 2: Box plot of 'annual_income_usd' by 'gender'
if df['gender'].nunique() > 1 and df['gender'].nunique() < 10:
    plt.figure(figsize=(10, 6))
    sns.boxplot(x='gender', y='annual_income_usd', data=df)
    plt.title('Annual Income Distribution by Gender')
    plt.xlabel('Gender')
    plt.ylabel('Annual Income (USD)')
    plt.ticklabel_format(style='plain', axis='y')
    plt.tight_layout()
    plt.savefig("income_by_gender_boxplot.png")
    print("Saved income_by_gender_boxplot.png")
    # plt.show()
else:
    print("Skipping 'Annual Income by Gender' boxplot: 'gender' column not suitable or too many categories.")
    plt.figure(figsize=(10, 6))
    sns.boxplot(y=df['annual_income_usd'])
    plt.title('Distribution of Annual Income (USD)')
    plt.ylabel('Annual Income (USD)')
    plt.ticklabel_format(style='plain', axis='y')
    plt.tight_layout()
    plt.savefig("annual_income_distribution.png")
    print("Saved annual_income_distribution.png (fallback).")


# Visualization 3: Correlation plot with a regression line: 'smokes_per_day' vs 'bmi'
if df['smokes_per_day'].nunique() > 1 and df['bmi'].nunique() > 1:
    lm = sns.lmplot(x='smokes_per_day', y='bmi', data=df,
                    line_kws={'color': 'red'}, scatter_kws={'alpha':0.5})
    lm.set_axis_labels("Smokes Per Day", "BMI")
    plt.title('Correlation: Smokes Per Day vs. BMI with Regression Line')
    fig = lm.fig 
    fig.suptitle('Correlation: Smokes Per Day vs. BMI with Regression Line', y=1.03)
    plt.tight_layout()
    plt.savefig("correlation_smokes_bmi.png")
    print("Saved correlation_smokes_bmi.png")
else:
    print("Skipping lmplot for 'smokes_per_day' vs 'bmi': Not enough unique data points.")
    plt.figure(figsize=(10, 6))
    sns.scatterplot(x='smokes_per_day', y='bmi', data=df, alpha=0.6)
    plt.title('Scatter Plot: Smokes Per Day vs. BMI')
    plt.xlabel('Smokes Per Day')
    plt.ylabel('BMI')
    plt.tight_layout()
    plt.savefig("scatter_smokes_bmi.png")
    print("Saved scatter_smokes_bmi.png as fallback.")


# Additional Visualization: 'drinks_per_week' vs 'sleep_hours'
if df['drinks_per_week'].nunique() > 1 and df['sleep_hours'].nunique() > 1:
    plt.figure(figsize=(10, 6))
    sns.scatterplot(x='drinks_per_week', y='sleep_hours', data=df, alpha=0.6, hue='gender' if df['gender'].nunique() < 10 else None)
    plt.title('Drinks Per Week vs. Sleep Hours')
    plt.xlabel('Drinks Per Week')
    plt.ylabel('Sleep Hours')
    plt.tight_layout()
    plt.savefig("drinks_vs_sleep.png")
    print("Saved drinks_vs_sleep.png")
else:
    print("Could not generate Drinks vs Sleep scatter plot due to data issues.")



FileNotFoundError: [Errno 2] No such file or directory: 'addiction_population_data.csv'