Data Setup & Preprocessing

In [1]:
import pandas as pd

# Read the weather CSV (path is relative to the notebook's working directory)
# into the DataFrame `df` that the following cells operate on.
df = pd.read_csv("GlobalWeatherRepository.csv")

# Report (rows, columns), then leave the first five records as the cell's
# displayed value.
print("Shape:", df.shape)
df.head()

Shape: (60218, 41)


Unnamed: 0,country,location_name,latitude,longitude,timezone,last_updated_epoch,last_updated,temperature_celsius,temperature_fahrenheit,condition_text,...,air_quality_PM2.5,air_quality_PM10,air_quality_us-epa-index,air_quality_gb-defra-index,sunrise,sunset,moonrise,moonset,moon_phase,moon_illumination
0,Afghanistan,Kabul,34.52,69.18,Asia/Kabul,1715849100,2024-05-16 13:15,26.6,79.8,Partly Cloudy,...,8.4,26.6,1,1,04:50 AM,06:50 PM,12:12 PM,01:11 AM,Waxing Gibbous,55
1,Albania,Tirana,41.33,19.82,Europe/Tirane,1715849100,2024-05-16 10:45,19.0,66.2,Partly cloudy,...,1.1,2.0,1,1,05:21 AM,07:54 PM,12:58 PM,02:14 AM,Waxing Gibbous,55
2,Algeria,Algiers,36.76,3.05,Africa/Algiers,1715849100,2024-05-16 09:45,23.0,73.4,Sunny,...,10.4,18.4,1,1,05:40 AM,07:50 PM,01:15 PM,02:14 AM,Waxing Gibbous,55
3,Andorra,Andorra La Vella,42.5,1.52,Europe/Andorra,1715849100,2024-05-16 10:45,6.3,43.3,Light drizzle,...,0.7,0.9,1,1,06:31 AM,09:11 PM,02:12 PM,03:31 AM,Waxing Gibbous,55
4,Angola,Luanda,-8.84,13.23,Africa/Luanda,1715849100,2024-05-16 09:45,26.0,78.8,Partly cloudy,...,183.4,262.3,5,10,06:12 AM,05:55 PM,01:17 PM,12:38 AM,Waxing Gibbous,55


In [None]:
# Per-column null counts, restricted to the columns that actually contain
# nulls and ordered from most to fewest missing values.
missing = (
    df.isnull()
    .sum()
    .loc[lambda counts: counts > 0]
    .sort_values(ascending=False)
)
print("Missing Values:\n", missing)

Missing Values:
 Series([], dtype: int64)


Handling Missing Values and Removing Outliers

In [5]:
# --- Missing-value handling ---
# Keep only the columns whose fraction of nulls is below `threshold` (40%).
# The explicit .copy() makes `df` an independent frame, so the column
# assignments below never write into a slice of the previous frame (which
# triggers SettingWithCopyWarning whenever a column is actually dropped).
threshold = 0.4
df = df.loc[:, df.isnull().mean() < threshold].copy()

# Numeric columns: replace nulls with that column's median.
num_cols = df.select_dtypes(include='number').columns
df[num_cols] = df[num_cols].fillna(df[num_cols].median())

# Object-dtype (categorical) columns: replace nulls with that column's most
# frequent value, taken from the first row of mode().
cat_cols = df.select_dtypes(include='object').columns
cat_modes = df[cat_cols].mode()
# mode() is empty when there are no object columns or none of them holds a
# non-null value; there is nothing to fill with then, so skip the step rather
# than index into an empty result.
if not cat_modes.empty:
    df[cat_cols] = df[cat_cols].fillna(cat_modes.iloc[0])

# IQR-based outlier screen.
# NOTE(review): the screen runs over every column in `num_cols`, not a
# hand-picked subset of "key" columns - confirm that is the intent.
quartiles = df[num_cols].quantile([0.25, 0.75])
Q1 = quartiles.loc[0.25]
Q3 = quartiles.loc[0.75]
IQR = Q3 - Q1

# Per-column fences placed 1.5 * IQR beyond the lower and upper quartiles.
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR

# A row is flagged as soon as any one of its numeric values falls outside the
# fences; only unflagged rows are kept. `df` itself is left unchanged.
numeric_part = df[num_cols]
row_has_outlier = ((numeric_part < lower_fence) | (numeric_part > upper_fence)).any(axis=1)
df_no_outliers = df[~row_has_outlier]
print("Shape after removing outliers:", df_no_outliers.shape)


Shape after removing outliers: (25425, 41)


In [7]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Scaler selection: StandardScaler is used here; MinMaxScaler is imported
# alongside it as the drop-in alternative.
scaler = StandardScaler()  # or MinMaxScaler()

# Fit the scaler on the outlier-free numeric columns and transform them in a
# single pass.
scaled_values = scaler.fit_transform(df_no_outliers[num_cols])

# Write the scaled values into a copy so `df_no_outliers` keeps its original
# (unscaled) values; non-numeric columns are carried over untouched.
df_scaled = df_no_outliers.copy()
df_scaled[num_cols] = scaled_values

# Show the first 10 rows of the scaled numeric columns.
print("Normalized Data (first 10 rows):")
print(df_scaled[num_cols].head(10))


Normalized Data (first 10 rows):
    latitude  longitude  last_updated_epoch  temperature_celsius  \
0   0.626613   1.244499           -1.575634             0.334946   
5  -0.054338  -1.443156           -1.575634             0.259042   
6  -2.078017  -1.377929           -1.575634            -2.018102   
9   1.161982   0.161273           -1.575634            -1.006038   
10  0.856728   0.848622           -1.575634            -0.879530   
11  0.257178  -1.761088           -1.575634             0.385550   
14 -0.211662  -1.397415           -1.575634             0.512058   
15  1.385053   0.391005           -1.575634            -0.879530   
17 -0.049251  -1.995333           -1.575634             0.259042   
18 -0.470736  -0.120764           -1.575634             0.512058   

    temperature_fahrenheit  wind_mph  wind_kph  wind_degree  pressure_mb  \
0                 0.329214 -0.009141 -0.017369     1.665337    -0.401891   
5                 0.258931 -0.586182 -0.588599    -0.729325    -0.