In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import PolynomialFeatures,StandardScaler,RobustScaler
from sklearn.metrics import mean_squared_error, r2_score

Matplotlib is building the font cache; this may take a moment.


## Load the Dataset

In [13]:
# Read the weather observations CSV into the working DataFrame.
df = pd.read_csv(filepath_or_buffer='weather_data_extended.csv')

# Print the schema summary (dtypes and non-null counts), then display the
# first five rows as the cell's output.
df.info()
df.head(5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800 entries, 0 to 799
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Location          800 non-null    object 
 1   Temperature (°C)  800 non-null    float64
 2   Feels Like (°C)   800 non-null    float64
 3   Humidity (%)      798 non-null    float64
 4   Wind Speed (kph)  798 non-null    float64
 5   Cloud Cover (%)   800 non-null    int64  
 6   Pressure (mb)     798 non-null    float64
 7   UV Index          800 non-null    int64  
 8   Visibility (km)   800 non-null    int64  
dtypes: float64(5), int64(3), object(1)
memory usage: 56.4+ KB


Unnamed: 0,Location,Temperature (°C),Feels Like (°C),Humidity (%),Wind Speed (kph),Cloud Cover (%),Pressure (mb),UV Index,Visibility (km)
0,New York,8.3,4.5,40.0,29.9,75,1003.0,2,16
1,New York,8.3,4.5,40.0,29.9,75,1003.0,2,16
2,New York,8.3,4.5,40.0,29.9,75,1003.0,2,16
3,New York,8.3,4.5,40.0,29.9,75,1003.0,2,16
4,New York,8.3,4.5,40.0,29.9,75,1003.0,2,16


## Check for Missing Values

In [14]:
# Count the missing entries in each column (isna is the same check as isnull).
df.isna().sum()

Location            0
Temperature (°C)    0
Feels Like (°C)     0
Humidity (%)        2
Wind Speed (kph)    2
Cloud Cover (%)     0
Pressure (mb)       2
UV Index            0
Visibility (km)     0
dtype: int64

## Dropping Rows with Missing Values

In [15]:
# Keep only the rows in which every column holds a value.
df = df.dropna(axis=0, how='any')

In [16]:
# Re-count missing entries per column to confirm none remain after the drop.
df.isna().sum()

Location            0
Temperature (°C)    0
Feels Like (°C)     0
Humidity (%)        0
Wind Speed (kph)    0
Cloud Cover (%)     0
Pressure (mb)       0
UV Index            0
Visibility (km)     0
dtype: int64

## Summary Statistics

In [18]:
# Summarise the numeric columns: count, mean, std, min, quartiles and max.
df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,Temperature (°C),Feels Like (°C),Humidity (%),Wind Speed (kph),Cloud Cover (%),Pressure (mb),UV Index,Visibility (km)
count,794.0,794.0,794.0,794.0,794.0,794.0,794.0,794.0
mean,16.58073,16.373804,73.513854,16.1233,43.513854,1013.041562,1.746851,10.732997
std,5.574821,7.382814,16.478897,10.338986,34.845115,7.608453,1.092285,1.966105
min,8.3,4.5,40.0,3.6,0.0,900.0,1.0,10.0
25%,15.0,13.8,67.0,6.1,0.0,1012.0,1.0,10.0
50%,16.0,16.0,76.0,9.0,50.0,1013.0,1.0,10.0
75%,24.0,25.5,88.0,24.1,75.0,1019.25,2.0,10.0
max,25.0,27.8,100.0,29.9,75.0,1024.0,4.0,16.0


## String Indexing (convert text into numbers)

In [23]:
# Encode every 'Location' string as an integer category code and store the
# codes in a new 'Location_index' column.
df['Location_index'] = df['Location'].astype('category').cat.codes
# NOTE(review): the codes are plain category numbers with no natural order;
# if this column is later used as a regression feature, confirm that is intended.

# Mark the first row in which each location appears. Every location maps to
# exactly one code, so these rows are also the first appearance of each code.
is_first_occurrence = ~df['Location'].duplicated()
first_rows = (
    df.loc[is_first_occurrence, ['Location', 'Location_index']]
    .reset_index(drop=True)
)

# Distinct location names and their codes, in order of first appearance.
unique_locations = first_rows['Location']
unique_locations_indices = first_rows['Location_index']

# Lookup table pairing each location name with its integer code.
unique_df = first_rows.rename(columns={
    'Location': 'Unique_Locations',
    'Location_index': 'Unique_Locations_indices',
})

unique_df

Unnamed: 0,Unique_Locations,Unique_Locations_indices
0,New York,3
1,London,2
2,Tokyo,7
3,Paris,4
4,Sydney,6
5,Dubai,0
6,Rome,5
7,Hong Kong,1
