## Import

In [1]:
import pandas as pd
import numpy as np

## Exploring data

### Read raw data from file

In [2]:
weather_df = None
weather_df = pd.read_csv('../data/internal/weather.csv')

In [3]:
weather_df = weather_df.drop("Rain", axis='columns')

In [4]:
weather_df.head()

Unnamed: 0,Country,Name,Weather,Temp,Humidity,Visibility,Wind speed,Clouds
0,JP,Tokyo,Clouds,283.3,43,10000,5.66,75
1,ID,Pecenongan,Clouds,299.01,86,6000,1.54,40
2,IN,New Delhi,Smoke,296.25,43,2200,0.0,40
3,CN,Guangzhou,Clear,295.07,44,10000,2.04,9
4,IN,Konkan Division,Smoke,304.15,62,3000,3.09,40


### How many rows and how many columns does the raw data have?

In [5]:
shape = None 
shape = weather_df.shape
print(shape)

(1001, 8)


### What does each line mean?


Each line contains information about the weather of a city.

### Does the raw data have duplicate rows?

In [6]:
num_duplicated_rows = None
num_duplicated_rows = weather_df.duplicated().sum()

In [7]:
#TEST
if num_duplicated_rows == 0:
    print(f"The raw data have no duplicated line!")
else:
    if num_duplicated_rows > 1:
        ext = "lines"
    else:
        ext = "line"
    print(f"The raw data have {num_duplicated_rows} duplicated " + ext + ". Please de-deduplicate the raw data!")

The raw data have 1 duplicated line. Please de-deduplicate the raw data!


In [8]:
#De-deduplicate the raw data
weather_df = weather_df.drop_duplicates()

In [9]:
#TEST
num_duplicated_rows = weather_df.duplicated().sum()
if num_duplicated_rows == 0:
    print(f"The raw data have no duplicated line!")
else:
    if num_duplicated_rows > 1:
        ext = "lines"
    else:
        ext = "line"
    print(f"The raw data have {num_duplicated_rows} duplicated " + ext + ". Please de-deduplicate your raw data!")

The raw data have no duplicated line!


### What does each column mean?

- Country: Country ID
- Name: City's name
- Weather: Weather situation
- Temp: Current temperature
- Humidity: Current humidity
- Visibility: Current vision
- Wind speed: Current wind speed
- Clouds: Cloud frequency

### What data type does each column currently have? Are there any columns whose data types are not suitable for further processing?

In [10]:
dtypes = None
dtypes = weather_df.dtypes

In [11]:
dtypes

Country        object
Name           object
Weather        object
Temp          float64
Humidity        int64
Visibility      int64
Wind speed    float64
Clouds          int64
dtype: object

There are not any columns whose data types are not suitable for further processing

### For each column with numeric data type, how are the values distributed?

In [12]:
num_col_info_df = None

num_col_info_df = weather_df.select_dtypes(include=np.number).copy()

def missing_ratio(column):
    return ((column.isnull().sum() / column.shape[0]) * 100).round(1)

def lower_quartile(column):
    return (column.quantile(0.25)).round(1)

def median(column):
    return (column.median())

def upper_quartile(column):
    return (column.quantile(0.75)).round(1)

# Làm tròn giá trị đến 1 chữ số thập phân
num_col_info_df = num_col_info_df.round(1)

num_col_info_df = num_col_info_df.agg([missing_ratio, "min", lower_quartile, median, upper_quartile, "max"])

In [13]:
num_col_info_df

Unnamed: 0,Temp,Humidity,Visibility,Wind speed,Clouds
missing_ratio,0.0,0.0,0.0,0.0,0.0
min,253.1,8.0,121.0,0.0,0.0
lower_quartile,278.8,53.8,10000.0,1.5,20.0
median,288.65,68.0,10000.0,2.6,57.0
upper_quartile,297.7,81.0,10000.0,4.1,88.0
max,311.3,100.0,10000.0,16.0,100.0


### Drop the column of which the percentage of missing values is greater than 75%

In [14]:
def drop_missing_features(df: pd.DataFrame, missing_lst: dict = dict(num_col_info_df.iloc[0]), threshold: float = 75.0) -> pd.DataFrame:
    for col, missing_rate in missing_lst.items():
        if missing_rate > threshold:
            del df[col]
    return df

In [15]:
weather_df = drop_missing_features(weather_df)

In [16]:
weather_df.head()

Unnamed: 0,Country,Name,Weather,Temp,Humidity,Visibility,Wind speed,Clouds
0,JP,Tokyo,Clouds,283.3,43,10000,5.66,75
1,ID,Pecenongan,Clouds,299.01,86,6000,1.54,40
2,IN,New Delhi,Smoke,296.25,43,2200,0.0,40
3,CN,Guangzhou,Clear,295.07,44,10000,2.04,9
4,IN,Konkan Division,Smoke,304.15,62,3000,3.09,40


### For each column with a non-numeric data type, how are the values distributed?

In [17]:
pd.set_option('display.max_colwidth', 100)
pd.set_option('display.max_columns', None)

In [18]:
cat_col_info_df = weather_df.select_dtypes(exclude=[np.number])

def missing_ratio(column):
    return ((column.isnull().sum() / column.shape[0]) * 100).round(1)

# Hàm tính số lượng giá trị
def num_values(column):
    return column.nunique()

# Hàm tính tỷ lệ của từng giá trị
def value_ratios(column):
    value_counts = column.value_counts() #Đếm số lượng của mỗi loại value trong 1 cột
    non_missing_count = value_counts.sum() #Tổng số lượng của tất cả value trong 1 cột
    ratios = (value_counts / non_missing_count * 100).round(1) #Lưu tỉ lệ vào Series
    ratios_dict = ratios.to_dict()
    sorted_ratios_dict = dict(sorted(ratios_dict.items(), key=lambda item: item[1], reverse=True))
    return sorted_ratios_dict

cat_col_info_df = cat_col_info_df.agg([missing_ratio, num_values, value_ratios])
cat_col_info_df

Unnamed: 0,Country,Name,Weather
missing_ratio,0.2,0.1,0.0
num_values,236,984,10
value_ratios,"{'CN': 35.8, 'IN': 5.8, 'US': 5.5, 'BR': 2.2, 'ID': 2.0, 'MX': 1.8, 'TR': 1.7, 'RU': 1.6, 'JP': ...","{'Kowloon': 0.2, 'Huai'an': 0.2, 'George Town': 0.2, 'La Paz': 0.2, 'Kingston': 0.2, 'Comayagüel...","{'Clouds': 64.4, 'Clear': 19.6, 'Rain': 7.8, 'Haze': 2.2, 'Snow': 1.8, 'Mist': 1.5, 'Smoke': 1.2..."


### Save the processed data

In [19]:
print(f"Total number of features: {weather_df.shape[1]}")
weather_df.dtypes

Total number of features: 8


Country        object
Name           object
Weather        object
Temp          float64
Humidity        int64
Visibility      int64
Wind speed    float64
Clouds          int64
dtype: object

In [20]:
save_name = "weather.csv"
weather_df.to_csv("../data/internal/" + save_name, index=False)