In [9]:
import pandas as pd 
import numpy as np 
import plotly.express as px

In [10]:
# Load the daily Delhi climate training set and preview its first rows.
climate_csv = 'DailyDelhiClimateTrain.csv'
df = pd.read_csv(climate_csv)
df.head()


Unnamed: 0,date,meantemp,humidity,wind_speed,meanpressure
0,2013-01-01,10.0,84.5,0.0,1015.666667
1,2013-01-02,7.4,92.0,2.98,1017.8
2,2013-01-03,7.166667,87.0,4.633333,1018.666667
3,2013-01-04,8.666667,71.333333,1.233333,1017.166667
4,2013-01-05,6.0,86.833333,3.7,1016.5


In [11]:
# Quick sanity report: frame dimensions, missing values and duplicated rows.
n_rows, n_cols = df.shape
print("Number of rows in the dataset:", n_rows)
print("Number of columns in the dataset:", n_cols)
# Note: this prints the per-column null counts (a Series), not a single total.
print("Number of null values in the dataset:", df.isna().sum())
print("Number of duplicate rows in the dataset:", df.duplicated().sum())

Number of rows in the dataset: 1462
Number of columns in the dataset: 5
Number of null values in the dataset: date            0
meantemp        0
humidity        0
wind_speed      0
meanpressure    0
dtype: int64
Number of duplicate rows in the dataset: 0


In [12]:
# Parse the date strings once, then derive the calendar parts from the parsed values.
parsed_dates = pd.to_datetime(df['date'])
df = df.assign(
    date=parsed_dates,
    year=parsed_dates.dt.year,
    month=parsed_dates.dt.month,
    day=parsed_dates.dt.day,
)

# Put the calendar columns right after the date, ahead of the measurements.
cols = ['date', 'year', 'month', 'day', 'meantemp', 'humidity', 'wind_speed', 'meanpressure']
df = df[cols]

df.head()

Unnamed: 0,date,year,month,day,meantemp,humidity,wind_speed,meanpressure
0,2013-01-01,2013,1,1,10.0,84.5,0.0,1015.666667
1,2013-01-02,2013,1,2,7.4,92.0,2.98,1017.8
2,2013-01-03,2013,1,3,7.166667,87.0,4.633333,1018.666667
3,2013-01-04,2013,1,4,8.666667,71.333333,1.233333,1017.166667
4,2013-01-05,2013,1,5,6.0,86.833333,3.7,1016.5


In [13]:
# Smooth the daily series with a centred 14-day rolling mean.
# With the default min_periods (= window size) the rows at either end of the
# series, where a full centred window is not available, are NaN, so the
# smoothed line starts and ends slightly inside the data range.
df['rolling_14_day'] = df['meantemp'].rolling(window=14, center=True).mean()

# Base figure: the raw daily temperature.
fig = px.line(df, x='date', y='meantemp',
              title='Delhi Mean Temperature Over Time with 14-Day Rolling Average',
              labels={'date': 'Year', 'meantemp': 'Mean Temperature'})

# Style and name the daily trace while it is still the only trace in the figure.
# Plotly Express does not give a single, un-grouped series a legend entry
# (showlegend=False), so it has to be switched on explicitly; otherwise the
# name set here never shows up next to the rolling-average entry.
fig.update_traces(
    name='Daily Temperature',
    showlegend=True,
    line=dict(color='darkblue', width=0.5),
)

# Overlay the rolling average as a second, thicker line.
fig.add_scatter(x=df['date'], y=df['rolling_14_day'],
                mode='lines',
                name='14-Day Rolling Average',
                line=dict(color='orange', width=4))

# Date axis: one tick every 12 months, labelled with the year only.
fig.update_xaxes(
    dtick="M12",
    tickformat="%Y"
)

fig.update_layout(
    xaxis_title="Year",
    yaxis_title="Mean Temperature (°C)",
    hovermode='x unified',
    # Pin the legend inside the plot area, top-left corner.
    legend=dict(
        yanchor="top",
        y=0.99,
        xanchor="left",
        x=0.01
    )
)

fig.show()

In [14]:
import scipy.stats as stats

# Per-year mean, sample standard deviation and number of daily observations.
yearly_stats = df.groupby('year')['meantemp'].agg(['mean', 'std', 'count']).reset_index()
# Restrict the comparison to years up to 2016. Per the recorded outputs the
# frame has 1462 rows and 1461 of them fall in 2013-2016, so whatever lies
# after 2016 is a single observation and cannot support a std / interval.
# .copy() so the columns added below go onto an independent frame.
yearly_stats = yearly_stats[yearly_stats['year'] <= 2016].copy()

# Two-sided 95% confidence level.
confidence_level = 0.95
alpha = 1 - confidence_level

# Student-t interval for each yearly mean: mean ± t(1 - alpha/2, n - 1) * std / sqrt(n).
# NOTE(review): this treats the daily readings within a year as independent;
# daily temperatures are seasonal and serially correlated, so the true
# uncertainty is likely wider than these intervals suggest.
yearly_stats['std_error'] = yearly_stats['std'] / np.sqrt(yearly_stats['count'])
# stats.t.ppf is vectorised over the degrees of freedom, so no row-wise apply is needed.
yearly_stats['t_critical'] = stats.t.ppf(1 - alpha / 2, df=yearly_stats['count'] - 1)
yearly_stats['margin_error'] = yearly_stats['t_critical'] * yearly_stats['std_error']
yearly_stats['ci_lower'] = yearly_stats['mean'] - yearly_stats['margin_error']
yearly_stats['ci_upper'] = yearly_stats['mean'] + yearly_stats['margin_error']

print(yearly_stats[['year', 'mean', 'std', 'count', 'ci_lower', 'ci_upper']])

# Bar chart of the yearly means; error_y draws ± margin_error around each bar.
fig = px.bar(yearly_stats, x='year', y='mean',
             title='Yearly Average Mean Temperature in Delhi with 95% Confidence Intervals',
             labels={'year': 'Year', 'mean': 'Average Mean Temperature (°C)'},
             error_y='margin_error')

fig.update_layout(
    xaxis_title="Year",
    yaxis_title="Average Mean Temperature (°C)",
    hovermode='x unified'
)

# 'year' is an integer column, so this is a numeric axis (not a date axis):
# use a numeric tick step of one year and an integer tick format rather than
# date-style dtick / strftime values.
fig.update_xaxes(
    dtick=1,
    tickformat="d"
)

# Only restyle the error bars; their values were already supplied via px.bar(error_y=...).
fig.update_traces(
    error_y=dict(
        color='black',
        thickness=2,
        width=5
    )
)

fig.show()

# One human-readable summary line per year.
print("\nDetailed Statistics:")
for _, row in yearly_stats.iterrows():
    print(f"Year {int(row['year'])}: Mean = {row['mean']:.2f}°C, "
          f"95% CI = [{row['ci_lower']:.2f}, {row['ci_upper']:.2f}]°C, "
          f"n = {int(row['count'])}")

   year       mean       std  count   ci_lower   ci_upper
0  2013  24.791494  7.409195    365  24.028855  25.554133
1  2014  25.010673  7.597334    365  24.228668  25.792677
2  2015  25.114591  7.237806    365  24.369594  25.859589
3  2016  27.103373  6.888469    366  26.395309  27.811437



Detailed Statistics:
Year 2013: Mean = 24.79°C, 95% CI = [24.03, 25.55]°C, n = 365
Year 2014: Mean = 25.01°C, 95% CI = [24.23, 25.79]°C, n = 365
Year 2015: Mean = 25.11°C, 95% CI = [24.37, 25.86]°C, n = 365
Year 2016: Mean = 27.10°C, 95% CI = [26.40, 27.81]°C, n = 366


In [15]:
# Give every row the mean temperature of its calendar year so that the
# year-level average can be correlated alongside the daily variables.
df['yearly_avg'] = df.groupby('year')['meantemp'].transform('mean')

# Columns entering the correlation matrix, mapped to the labels shown on the
# heatmap. The dict's insertion order is the row/column order of the matrix.
display_names = {
    'meantemp': 'Mean Temp',
    'yearly_avg': 'Yearly Avg',
    'humidity': 'Humidity',
    'wind_speed': 'Wind Speed',
    'meanpressure': 'Mean Pressure',
    'year': 'Year',
    'month': 'Month',
}
numeric_cols = list(display_names)

# Pairwise correlations (DataFrame.corr default method), then relabel both axes for display.
corr_matrix = df[numeric_cols].corr()
corr_display = corr_matrix.rename(index=display_names, columns=display_names)

# Annotated heatmap on a diverging scale fixed to the full [-1, 1] range.
fig = px.imshow(
    corr_display,
    text_auto='.2f',
    aspect="auto",
    title="Weather Variables Correlation Matrix",
    color_continuous_scale='RdBu_r',
    range_color=[-1, 1],
)

# Centre the title, fix the canvas size, and tick the colorbar every 0.5 from -1.
fig.update_layout(
    title_x=0.5,
    width=700,
    height=600,
    font=dict(size=12),
    coloraxis_colorbar=dict(
        tickmode="linear",
        tick0=-1,
        dtick=0.5,
    ),
)

fig.show()

In [16]:
from sklearn.metrics import r2_score
from plotly.subplots import make_subplots
import plotly.graph_objects as go

def _add_scatter_with_trend(fig, x, y, col, marker_color, trend_color):
    """Draw ``y`` against ``x`` as markers plus a least-squares line in one subplot.

    Parameters
    ----------
    fig : figure created by ``make_subplots`` with a single row.
    x, y : equal-length numeric sequences (here: DataFrame columns).
    col : 1-based subplot column to draw into (the row is always 1).
    marker_color, trend_color : colours of the points and of the fitted line.

    Returns
    -------
    float
        R² of the degree-1 ``np.polyfit`` fit, as computed by ``r2_score``.
    """
    # Raw observations. The trace is deliberately unnamed, so keep it out of
    # the legend instead of letting it occupy a blank legend row.
    fig.add_trace(
        go.Scatter(x=x, y=y,
                   mode='markers',
                   name='',
                   showlegend=False,
                   marker=dict(size=3, opacity=0.6, color=marker_color)),
        row=1, col=col
    )

    # Degree-1 least-squares fit, evaluated once and reused for both R² and the line.
    trend = np.poly1d(np.polyfit(x, y, 1))
    y_pred = trend(x)
    r2 = r2_score(y, y_pred)

    fig.add_trace(
        go.Scatter(x=x, y=y_pred,
                   mode='lines',
                   name=f'Trend (R² = {r2:.3f})',
                   line=dict(color=trend_color, width=2)),
        row=1, col=col
    )
    return r2


# Two side-by-side panels: temperature against humidity and against wind speed.
fig = make_subplots(
    rows=1, cols=2,
    subplot_titles=('Mean Temperature vs Humidity', 'Mean Temperature vs Wind Speed')
)

r2_humidity = _add_scatter_with_trend(
    fig, df['humidity'], df['meantemp'],
    col=1, marker_color='steelblue', trend_color='red'
)
r2_wind = _add_scatter_with_trend(
    fig, df['wind_speed'], df['meantemp'],
    col=2, marker_color='orange', trend_color='purple'
)

fig.update_xaxes(
    title_text="Humidity (%)",
    title_font=dict(size=12),
    tickfont=dict(size=10),
    row=1, col=1
)

fig.update_xaxes(
    title_text="Wind Speed (km/h)",
    title_font=dict(size=12),
    tickfont=dict(size=10),
    row=1, col=2
)

# No row/col given: the same y-axis title and fonts apply to both panels.
fig.update_yaxes(
    title_text="Mean Temperature (°C)",
    title_font=dict(size=12),
    tickfont=dict(size=10)
)

fig.update_layout(
    title="Pairplot with Trend Lines and R² Values",
    height=500,
    showlegend=True
)

fig.show()

# Print the R² values
print(f"R² for Humidity vs Mean Temperature: {r2_humidity:.4f}")
print(f"R² for Wind Speed vs Mean Temperature: {r2_wind:.4f}")

R² for Humidity vs Mean Temperature: 0.3271
R² for Wind Speed vs Mean Temperature: 0.0939
