In [5]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv("/content/Traffic_Flow.csv")

In [3]:
df.head()

Unnamed: 0,site,day,date,start_time,end_time,flow,flow_pc,cong,cong_pc,dsat,dsat_pc,ObjectId
0,N01111A,TH,05/01/2023,2023/07/04 08:30:00+00,09:45,0,0,0,0,50,98,1
1,N01111A,WE,04/01/2023,2023/07/04 07:30:00+00,08:45,300,100,0,100,22,98,2
2,N01111A,FR,06/01/2023,2023/07/04 09:30:00+00,10:45,445,100,0,100,31,100,3
3,N01111A,TU,03/01/2023,2023/07/04 06:30:00+00,07:45,0,0,0,0,11,32,4
4,N01111A,TH,05/01/2023,2023/07/04 08:45:00+00,10:00,0,0,0,0,50,107,5


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 12 columns):
 #   Column      Non-Null Count    Dtype 
---  ------      --------------    ----- 
 0   site        1048575 non-null  object
 1   day         1048575 non-null  object
 2   date        1048575 non-null  object
 3   start_time  1048575 non-null  object
 4   end_time    1048575 non-null  object
 5   flow        1048575 non-null  int64 
 6   flow_pc     1048575 non-null  int64 
 7   cong        1048575 non-null  int64 
 8   cong_pc     1048575 non-null  int64 
 9   dsat        1048575 non-null  int64 
 10  dsat_pc     1048575 non-null  int64 
 11  ObjectId    1048575 non-null  int64 
dtypes: int64(7), object(5)
memory usage: 96.0+ MB


In [6]:
# Summary statistics for numerical columns
print(df.describe())

               flow       flow_pc          cong       cong_pc          dsat  \
count  1.048575e+06  1.048575e+06  1.048575e+06  1.048575e+06  1.048575e+06   
mean   1.330048e+02  6.874974e+01  5.719858e-01  6.874974e+01  2.428677e+01   
std    2.100803e+02  4.622401e+01  3.554286e+00  4.622401e+01  3.343699e+01   
min    0.000000e+00  0.000000e+00  0.000000e+00  0.000000e+00  0.000000e+00   
25%    0.000000e+00  0.000000e+00  0.000000e+00  0.000000e+00  0.000000e+00   
50%    1.600000e+01  1.000000e+02  0.000000e+00  1.000000e+02  1.100000e+01   
75%    1.910000e+02  1.000000e+02  0.000000e+00  1.000000e+02  4.100000e+01   
max    2.142000e+03  2.000000e+02  8.900000e+01  2.000000e+02  1.356000e+03   

            dsat_pc      ObjectId  
count  1.048575e+06  1.048575e+06  
mean   7.088241e+01  5.242880e+05  
std    4.278320e+01  3.026977e+05  
min    0.000000e+00  1.000000e+00  
25%    2.400000e+01  2.621445e+05  
50%    9.700000e+01  5.242880e+05  
75%    1.000000e+02  7.864315e+05  


In [7]:
# Check for missing values
print(df.isnull().sum())

site          0
day           0
date          0
start_time    0
end_time      0
flow          0
flow_pc       0
cong          0
cong_pc       0
dsat          0
dsat_pc       0
ObjectId      0
dtype: int64


In [22]:
from scipy.stats import ttest_ind

# Check variability in 'flow' for two specific sites (replace site_1 and site_2 with actual site names)
site_1_data = df[df['site'] == 'site_1']['flow']
site_2_data = df[df['site'] == 'site_2']['flow']

print(f"Site 1 Flow Variability: {site_1_data.var()}")
print(f"Site 2 Flow Variability: {site_2_data.var()}")

# T-test
t_statistic, p_value = ttest_ind(site_1_data, site_2_data, nan_policy='omit')
print(f'T-Statistic: {t_statistic}, P-Value: {p_value}')


Site 1 Flow Variability: nan
Site 2 Flow Variability: nan
T-Statistic: nan, P-Value: nan


## encountering challenges with the T-test due to issues with variability in the 'flow' data for specific sites, or if you're looking for alternative techniques

# Kruskal-Wallis H Test:

This is a non-parametric test that can be used to compare three or more groups when the assumptions of normality and equal variances are not met.

In [24]:
from scipy.stats import kruskal

# Assuming there are more than two sites
site_3_data = df[df['site'] == 'site_3']['flow']

# Kruskal-Wallis H Test
H_statistic, p_value_kruskal = kruskal(site_1_data, site_2_data, site_3_data)
print(f'Kruskal-Wallis H Statistic: {H_statistic}, P-Value: {p_value_kruskal}')


Kruskal-Wallis H Statistic: nan, P-Value: nan


## Spearman

In [25]:
from scipy.stats import spearmanr

# Spearman's Rank Correlation
correlation_coefficient, p_value_spearman = spearmanr(df['flow'], df['cong_pc'])
print(f'Spearman Rank Correlation Coefficient: {correlation_coefficient}, P-Value: {p_value_spearman}')


Spearman Rank Correlation Coefficient: 0.6936086610371052, P-Value: 0.0


## ANOVA

In [27]:
from scipy.stats import f_oneway

# Assuming there are more than two sites
site_3_data = df[df['site'] == 'site_3']['flow']

# ANOVA
F_statistic, p_value_anova = f_oneway(site_1_data, site_2_data, site_3_data)
print(f'ANOVA F-Statistic: {F_statistic}, P-Value: {p_value_anova}')


ANOVA F-Statistic: nan, P-Value: nan


