### Import Libraries

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# NOTE(review): color_palette() only returns the "Spectral" palette (it is shown
# as this cell's output); it does not change seaborn's default palette. The
# plots in this file pass palette='Spectral' explicitly. Presumably
# sns.set_palette("Spectral") was intended here -- confirm before changing.
sns.color_palette("Spectral")

### Load Data

In [None]:
# Google Drive share link of the raw weather CSV.
share_link = 'https://drive.google.com/file/d/16UTzifXtnQWoStzn22AyfEqGDOAGbZW_/view?usp=sharing'
# The file id is the second-to-last path segment of the share link; it is
# appended to the "uc?id=" address, which is the URL handed to read_csv.
file_id = share_link.split('/')[-2]
url = 'https://drive.google.com/uc?id=' + file_id
df = pd.read_csv(url)
df

### Basic Exploration

In [None]:
# (number of rows, number of columns) of the loaded frame
df.shape

In [None]:
# quick overview of the data: column names, non-null counts and dtypes
df.info()

In [None]:
# names of all columns
df.columns

In [None]:
# number of null values in each column
df.isna().sum()

In [None]:
# number of distinct values in each column
df.nunique()

In [None]:
# basic descriptive statistics for the numerical features
df.describe()

In [None]:
# basic descriptive statistics for the categorical (object-dtype) features
df.describe(include='O')

### Preprocessing

In [None]:
# drop the leftover index column ('Unnamed: 0') together with 'time' and 'date'
df = df.drop(columns=['Unnamed: 0', 'time', 'date'])

In [None]:
# strip the trailing '%' from humidity, then cast the column to float
df['humidity'] = df['humidity'].str.rstrip('%').astype('float')

In [None]:
# Fill the null values with the column mean.
# The result is assigned back to the column: calling fillna(inplace=True) on a
# selected column is chained assignment, which warns on pandas 2.x and leaves
# df unchanged once Copy-on-Write is active.
df['barometer'] = df['barometer'].fillna(df['barometer'].mean())
df['humidity'] = df['humidity'].fillna(df['humidity'].mean())

In [None]:
# drop every row that still has a null value in any column
# (in place: df keeps referring to the same frame object)
df.dropna(inplace=True)

In [None]:
# df.weather.value_counts()

In [None]:
# weather classes and how often each occurs, before the labels are merged
weather = df['weather'].value_counts()
df_weather = pd.DataFrame(list(weather.items()), columns=['category', 'count'])
df_weather

In [None]:
# number of rows per city
df['city'].value_counts()

In [None]:
# lower-case the labels so the lower-case keywords used for the class mapping
# match regardless of the original capitalisation
df['weather']=df['weather'].str.lower()

In [None]:
def weather_classes_mapping(df,word,classType):
  """
  Relabel every row whose 'weather' text contains `word` as `classType`.

  The frame is modified in place; nothing is returned.

        Parameters:
                df (DataFrame): data with a string 'weather' column
                word (str): literal substring to search for (case-sensitive)
                classType (str): label written to the matching rows
  """
  # na=False: a missing label counts as "no match" instead of putting NaN
  # into the mask, which .loc would reject.
  # regex=False: `word` is a plain substring, so characters such as '.' or
  # '(' are not interpreted as a pattern.
  matches = df['weather'].str.contains(word, regex=False, na=False)
  df.loc[matches, 'weather'] = classType

In [None]:
# (keyword, class) pairs, applied in this order: each pass relabels the rows
# whose current label contains the keyword
keyword_to_class = [
  ('clouds', 'Cloudy'),
  ('cloudy', 'Cloudy'),
  ('rain', 'Rain'),
  ('sunny', 'Sunny'),
  ('duststorm', 'Duststorm'),
  ('fog', 'Fog'),
  ('clear', 'Clear'),
  ('haze', 'Haze'),
  ('sandstorm', 'Sandstorm'),
  ('overcast', 'Overcast'),
]
for keyword, label in keyword_to_class:
  weather_classes_mapping(df, keyword, label)

In [None]:
# weather classes and how often each occurs, after the labels are merged
weather = df['weather'].value_counts()
df_weather = pd.DataFrame(list(weather.items()), columns=['category', 'count'])
df_weather

In [None]:
# keep only the rows whose label is one of the merged weather classes
target_classes = [
  "Cloudy", "Rain", "Sunny", "Duststorm", "Fog",
  "Clear", "Haze", "Sandstorm", "Overcast",
]
is_target_class = df['weather'].isin(target_classes)
df_new = df[is_target_class]

In [None]:
# one box plot per numeric metric, stacked in a single figure
fig, axes = plt.subplots(5, figsize = (18,18))
metrics = ['wind', 'temp', 'humidity', 'barometer', 'visibility']
for ax, metric in zip(axes, metrics):
  sns.boxplot(x=metric, data=df, palette='Spectral', ax=ax)
  ax.set_xlabel(metric)
  ax.set_ylabel('Frequency')
fig.suptitle('Detect the outliers of different weather metrics and attributes (Before removing )',fontsize=25)
fig.show()

In [None]:
def IQR(df,col):
  """
  Remove the rows whose value in `col` is an outlier by the 1.5 * IQR rule.

  A row is kept when its value lies inside
  [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR], bounds included. Rows with a missing
  value in `col` fail that comparison and are dropped as well.

        Parameters:
                df (DataFrame): data
                col (str): name of a numerical column
        Returns:
                DataFrame: the rows of `df` that are kept; `df` is not modified
  """
  percentile25 = df[col].quantile(0.25)
  percentile75 = df[col].quantile(0.75)
  iqr = percentile75 - percentile25
  upper_limit = percentile75 + 1.5 * iqr
  lower_limit = percentile25 - 1.5 * iqr
  # Inclusive bounds: a value sitting exactly on a fence is not an outlier.
  # With strict comparisons, a column whose quartiles coincide (iqr == 0)
  # would lose every row.
  return df[df[col].between(lower_limit, upper_limit)]



In [None]:
# Apply the IQR filter to one column after another. The order matters: each
# call computes its quartiles on the rows that survived the previous columns.
ls=['temp', 'wind', 'humidity', 'barometer', 'visibility']
for col in ls:
  df=IQR(df,col)

In [None]:
# same five box plots, drawn again on the frame left after the IQR filter
fig, axes = plt.subplots(5, figsize = (18,18))
metrics = ['wind', 'temp', 'humidity', 'barometer', 'visibility']
for ax, metric in zip(axes, metrics):
  sns.boxplot(x=metric, data=df, palette='Spectral', ax=ax)
  ax.set_xlabel(metric)
  ax.set_ylabel('Frequency')
fig.suptitle('Detect the outliers of different weather metrics and attributes (After removing )',fontsize=25)
fig.show()

In [None]:
# save clean data
# df_new was extracted from df before the IQR outlier filter was applied to df,
# so on its own it still contains the outlier rows. Keep only the rows of
# df_new whose index label is still present in df after that filter.
# NOTE(review): this relies on the row index labels being unique -- confirm.
df_new = df_new[df_new.index.isin(df.index)]
df_new.to_csv("Weather_Data_Clean.csv")