In [None]:
# -----------------------------
# Dataset Cleaning Template
# -----------------------------

# 1. Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# -----------------------------
# Configuration: input and output locations used by the pipeline below
DATA_PATH = r"C:\Users\jsrv7\Downloads\archive\GlobalWeatherRepository.csv"
OUTPUT_PATH = "cleaned_dataset.csv"


# -----------------------------
# Cleaning helpers
def fill_missing_values(df, text_fill="Unknown"):
    """Return a copy of *df* with missing values filled.

    Numeric columns are filled with their own column mean; object (text)
    columns are filled with *text_fill*. The input frame is not modified.
    """
    filled = df.copy()
    # Assign the result back to the column. The chained form
    # `df[col].fillna(..., inplace=True)` acts on an intermediate object and
    # pandas warns that it will stop updating the frame in pandas 3.0.
    for col in filled.select_dtypes(include=np.number).columns:
        filled[col] = filled[col].fillna(filled[col].mean())
    for col in filled.select_dtypes(include="object").columns:
        filled[col] = filled[col].fillna(text_fill)
    return filled


def remove_iqr_outliers(df, columns, factor=1.5):
    """Return the rows of *df* that are not IQR outliers in any of *columns*.

    A value is an outlier when it lies below Q1 - factor * IQR or above
    Q3 + factor * IQR of its column. All fences are computed once, on the
    frame as passed in, so the result does not depend on column order.
    Missing values are never treated as outliers.
    """
    columns = list(columns)
    if not columns:
        return df

    values = df[columns]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1

    # A zero (or undefined) IQR puts both fences on a single value, which
    # would discard every row that differs from it; skip those columns.
    usable = iqr[iqr > 0].index
    lower = (q1 - factor * iqr)[usable]
    upper = (q3 + factor * iqr)[usable]

    checked = values[usable]
    is_outlier = checked.lt(lower, axis=1) | checked.gt(upper, axis=1)
    return df[~is_outlier.any(axis=1)]


def scale_numeric_columns(df, columns, scaler=None):
    """Return a copy of *df* with *columns* rescaled by *scaler*.

    A new MinMaxScaler is used when *scaler* is None. Nothing is scaled when
    there are no rows or no columns, because the scaler cannot be fitted on
    empty data.
    """
    columns = list(columns)
    scaled = df.copy()
    if columns and len(scaled) > 0:
        if scaler is None:
            scaler = MinMaxScaler()
        scaled[columns] = scaler.fit_transform(scaled[columns])
    return scaled


# True both in a notebook cell and when run as a script, so the pipeline still
# runs there and `df`, `numeric_cols` and `scaler` stay available afterwards.
if __name__ == "__main__":
    # -----------------------------
    # 2. Load the dataset
    df = pd.read_csv(DATA_PATH)

    # Preview the first few rows
    print("First 5 rows of dataset:")
    display(df.head())

    # -----------------------------
    # 3. Explore the dataset
    print("\nDataset Info:")
    df.info()

    print("\nSummary Statistics:")
    display(df.describe())

    print("\nMissing Values per Column:")
    print(df.isnull().sum())

    print("\nNumber of duplicate rows:", df.duplicated().sum())

    # -----------------------------
    # 4. Handle Missing Values
    # Alternative: drop rows with missing data via df.dropna()
    df = fill_missing_values(df)

    # -----------------------------
    # 5. Remove Duplicates
    df = df.drop_duplicates()

    # -----------------------------
    # 6. Convert Data Types (example)

    # -----------------------------
    # 7. Handle Outliers (optional, using IQR method)
    # Done before scaling: min-max scaling is driven by the extreme values,
    # so scaling first would let the outliers define the range and the saved
    # columns would no longer span [0, 1] once those rows are removed.
    numeric_cols = df.select_dtypes(include=np.number).columns
    df = remove_iqr_outliers(df, numeric_cols)

    # -----------------------------
    # 8. Normalize / Scale numeric columns (optional)
    scaler = MinMaxScaler()
    df = scale_numeric_columns(df, numeric_cols, scaler)

    # -----------------------------
    # 9. Save cleaned dataset
    df.to_csv(OUTPUT_PATH, index=False)
    print(f"\nCleaned dataset saved as '{OUTPUT_PATH}'")

    # -----------------------------
    # 10. Final check
    print("\nCleaned Dataset Info:")
    df.info()


First 5 rows of dataset:


Unnamed: 0,country,location_name,latitude,longitude,timezone,last_updated_epoch,last_updated,temperature_celsius,temperature_fahrenheit,condition_text,...,air_quality_PM2.5,air_quality_PM10,air_quality_us-epa-index,air_quality_gb-defra-index,sunrise,sunset,moonrise,moonset,moon_phase,moon_illumination
0,Afghanistan,Kabul,34.52,69.18,Asia/Kabul,1715849100,2024-05-16 13:15,26.6,79.8,Partly Cloudy,...,8.4,26.6,1,1,04:50 AM,06:50 PM,12:12 PM,01:11 AM,Waxing Gibbous,55
1,Albania,Tirana,41.33,19.82,Europe/Tirane,1715849100,2024-05-16 10:45,19.0,66.2,Partly cloudy,...,1.1,2.0,1,1,05:21 AM,07:54 PM,12:58 PM,02:14 AM,Waxing Gibbous,55
2,Algeria,Algiers,36.76,3.05,Africa/Algiers,1715849100,2024-05-16 09:45,23.0,73.4,Sunny,...,10.4,18.4,1,1,05:40 AM,07:50 PM,01:15 PM,02:14 AM,Waxing Gibbous,55
3,Andorra,Andorra La Vella,42.5,1.52,Europe/Andorra,1715849100,2024-05-16 10:45,6.3,43.3,Light drizzle,...,0.7,0.9,1,1,06:31 AM,09:11 PM,02:12 PM,03:31 AM,Waxing Gibbous,55
4,Angola,Luanda,-8.84,13.23,Africa/Luanda,1715849100,2024-05-16 09:45,26.0,78.8,Partly cloudy,...,183.4,262.3,5,10,06:12 AM,05:55 PM,01:17 PM,12:38 AM,Waxing Gibbous,55



Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 97435 entries, 0 to 97434
Data columns (total 41 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   country                       97435 non-null  object 
 1   location_name                 97435 non-null  object 
 2   latitude                      97435 non-null  float64
 3   longitude                     97435 non-null  float64
 4   timezone                      97435 non-null  object 
 5   last_updated_epoch            97435 non-null  int64  
 6   last_updated                  97435 non-null  object 
 7   temperature_celsius           97435 non-null  float64
 8   temperature_fahrenheit        97435 non-null  float64
 9   condition_text                97435 non-null  object 
 10  wind_mph                      97435 non-null  float64
 11  wind_kph                      97435 non-null  float64
 12  wind_degree                   97435 non-null 

Unnamed: 0,latitude,longitude,last_updated_epoch,temperature_celsius,temperature_fahrenheit,wind_mph,wind_kph,wind_degree,pressure_mb,pressure_in,...,gust_kph,air_quality_Carbon_Monoxide,air_quality_Ozone,air_quality_Nitrogen_dioxide,air_quality_Sulphur_dioxide,air_quality_PM2.5,air_quality_PM10,air_quality_us-epa-index,air_quality_gb-defra-index,moon_illumination
count,97435.0,97435.0,97435.0,97435.0,97435.0,97435.0,97435.0,97435.0,97435.0,97435.0,...,97435.0,97435.0,97435.0,97435.0,97435.0,97435.0,97435.0,97435.0,97435.0,97435.0
mean,19.146115,22.112882,1737542000.0,22.788837,73.021634,8.241385,13.266501,170.863827,1013.96765,29.941811,...,18.555623,517.522988,62.769735,15.819119,11.268231,26.167342,53.765763,1.760979,2.7622,49.699636
std,24.457297,65.819599,12522290.0,8.893859,16.008748,7.881144,12.681089,102.719479,11.381238,0.336032,...,14.682107,844.567024,32.260021,26.074344,40.805656,40.829209,164.16751,0.981699,2.567807,35.089898
min,-41.3,-175.2,1715849000.0,-24.9,-12.8,2.2,3.6,1.0,947.0,27.96,...,3.6,-9999.0,0.0,0.0,-9999.0,0.168,-1848.15,1.0,1.0,0.0
25%,3.75,-6.8361,1726743000.0,18.1,64.6,4.0,6.5,83.0,1010.0,29.83,...,10.5,238.65,42.0,1.295,0.8,7.4,10.9,1.0,1.0,15.0
50%,17.25,23.3167,1737541000.0,25.0,77.0,6.9,11.2,165.0,1013.0,29.92,...,15.9,327.45,60.0,4.995,2.405,15.17,22.2,1.0,2.0,50.0
75%,40.4,50.58,1748422000.0,28.3,82.9,11.4,18.4,256.0,1017.5,30.05,...,24.4,507.4,79.0,17.945,9.065,29.97,46.065,2.0,3.0,85.0
max,64.15,179.22,1759132000.0,49.2,120.6,1841.2,2963.2,360.0,3006.0,88.77,...,2970.4,38879.398,480.7,427.7,521.33,1614.1,6037.29,6.0,10.0,100.0



Missing Values per Column:
country                         0
location_name                   0
latitude                        0
longitude                       0
timezone                        0
last_updated_epoch              0
last_updated                    0
temperature_celsius             0
temperature_fahrenheit          0
condition_text                  0
wind_mph                        0
wind_kph                        0
wind_degree                     0
wind_direction                  0
pressure_mb                     0
pressure_in                     0
precip_mm                       0
precip_in                       0
humidity                        0
cloud                           0
feels_like_celsius              0
feels_like_fahrenheit           0
visibility_km                   0
visibility_miles                0
uv_index                        0
gust_mph                        0
gust_kph                        0
air_quality_Carbon_Monoxide     0
air_quality_Ozone   

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mean(), inplace=True)  # Fill numeric columns with mean
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna('Unknown', inplace=True)       # Fill categorical columns with 'Unknown'



Cleaned dataset saved as 'cleaned_dataset.csv'

Cleaned Dataset Info:
<class 'pandas.core.frame.DataFrame'>
Index: 32831 entries, 0 to 97434
Data columns (total 41 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   country                       32831 non-null  object 
 1   location_name                 32831 non-null  object 
 2   latitude                      32831 non-null  float64
 3   longitude                     32831 non-null  float64
 4   timezone                      32831 non-null  object 
 5   last_updated_epoch            32831 non-null  float64
 6   last_updated                  32831 non-null  object 
 7   temperature_celsius           32831 non-null  float64
 8   temperature_fahrenheit        32831 non-null  float64
 9   condition_text                32831 non-null  object 
 10  wind_mph                      32831 non-null  float64
 11  wind_kph                      32831 non-null  float64