# Basic Stats

This notebook walks through some basic statistics operations for describing and analyzing data.

1. import libraries
2. import the .csv file as a dataframe
3. remove missing values
4. describe the data, split it at the median age, and run a Welch's t-test on the two groups

In [21]:
# Import Libraries
import pandas as pd
import scipy.stats as stats # for stats
from scipy.stats import ttest_ind
import matplotlib.pyplot as plt # for plotting
# import seaborn as sns # also for plotting

In [5]:
# Read the simulated data set from disk into a pandas DataFrame.
# The path is relative, so it is resolved against the kernel's working directory.
file_path = "DG_Sim_Data.csv"
df = pd.read_csv(filepath_or_buffer=file_path)

# Preview the top of the frame (5 rows, which is also head()'s default)
df.head(5)

Unnamed: 0,age,height,gender,is_active,signup_date
0,37.0,,female,False,2020-04-27
1,35.0,180.86,female,False,
2,63.0,176.8,male,True,2021-01-29
3,64.0,191.69,other,True,2023-01-17
4,27.0,161.93,female,True,2016-02-03


In [9]:
# Clean the data: keep only the rows that have no missing value in ANY column.
# The raw frame `df` is left untouched; the result is a new frame.
df_clean = df.dropna(axis=0, how="any")
df_clean.head(5)

Unnamed: 0,age,height,gender,is_active,signup_date
2,63.0,176.8,male,True,2021-01-29
3,64.0,191.69,other,True,2023-01-17
4,27.0,161.93,female,True,2016-02-03
5,43.0,176.31,female,False,2017-11-10
6,54.0,176.78,other,True,2025-04-24


In [11]:
# Summary statistics (count, mean, std, min, quartiles, max) for the cleaned data.
# For a mixed-type frame, describe() reports only the numeric columns by default;
# pass include="all" if the non-numeric columns should be summarised as well.
df_clean.describe()

Unnamed: 0,age,height
count,155.0,155.0
mean,40.890323,171.000839
std,13.198075,10.123093
min,18.0,146.85
25%,29.5,164.425
50%,41.0,170.6
75%,51.5,177.46
max,64.0,199.09


In [29]:
# Individual statistics can also be pulled for a single column, e.g. the median age
df_clean.loc[:, "age"].median()

41.0

In [15]:
# Other reductions work the same way, e.g. the total of all ages
df_clean.loc[:, "age"].sum()

6338.0

In [36]:
# Split the cleaned data at the median age so that we end up with two groups
# of the same kind of measurement to feed into the statistical tests below.
median_val = df_clean['age'].median()
print(f' the median value is {median_val}')

# Lower group: ages at or below the median (ties land in this group)
less_than = df_clean.loc[df_clean['age'].le(median_val)]
print(less_than)

# Upper group: ages strictly above the median
greater_than = df_clean.loc[df_clean['age'].gt(median_val)]
print(greater_than)

 the median value is 41.0
      age  height  gender is_active signup_date
4    27.0  161.93  female      True  2016-02-03
7    32.0  174.37  female      True  2023-11-21
8    26.0  149.44   other      True  2019-07-28
10   34.0  189.72  female      True  2020-10-06
14   34.0  174.15   other      True  2023-02-12
..    ...     ...     ...       ...         ...
192  22.0  188.98   other      True  2023-03-27
193  37.0  146.85   other     False  2016-09-17
194  27.0  180.64   other     False  2023-10-12
195  40.0  168.79   other      True  2024-07-23
199  23.0  166.29    male      True  2018-01-17

[81 rows x 5 columns]
      age  height  gender is_active signup_date
2    63.0  176.80    male      True  2021-01-29
3    64.0  191.69   other      True  2023-01-17
5    43.0  176.31  female     False  2017-11-10
6    54.0  176.78   other      True  2025-04-24
11   51.0  164.47  female     False  2019-06-14
..    ...     ...     ...       ...         ...
181  46.0  164.45  female      True  20

In [38]:
# Welch's t-test on the ages of the two median-split groups.
# equal_var=False selects Welch's variant, which does not assume that the two
# groups share the same variance.
#
# NOTE: the groups were formed by thresholding 'age' itself, so their ages
# differ by construction. A "significant" result here only illustrates the API;
# it is not evidence of anything. For a real question, compare a column that
# was not used to define the split (e.g. 'height').
x1 = less_than['age']
x2 = greater_than['age']

t_stat, p_val = ttest_ind(x1, x2, equal_var=False)

print(f' the t-statistic is: {t_stat: .4f}')
# General ('g') formatting switches to scientific notation for very small
# p-values instead of rounding them to a misleading 0.0000.
print(f' the p-value is {p_val: .4g}')

 the t-statistic is: -19.4577
 the p-value is  0.0000
