USING AN ARIMA MODEL TO PREDICT TIME SERIES DATA FOR THE NEXT 7 DAYS

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn as sk

In [None]:
# Read the weather dataset from a CSV file (path is relative to the working directory)
df = pd.read_csv(filepath_or_buffer='IndianWeatherRepository.csv')

In [None]:
# Attributes treated as redundant for this analysis, grouped for readability
col_to_remove = [
    'country', 'latitude', 'longitude', 'timezone', 'last_updated_epoch',
    'temperature_celsius', 'wind_mph', 'wind_direction', 'pressure_in',
    'precip_in', 'feels_like_celsius', 'feels_like_fahrenheit',
    'visibility_miles', 'gust_mph', 'uv_index', 'gust_kph',
    'sunrise', 'sunset', 'moonrise', 'moonset', 'moon_phase',
    'moon_illumination', 'location_name',
    'air_quality_Carbon_Monoxide', 'air_quality_Ozone',
    'air_quality_Nitrogen_dioxide', 'air_quality_Sulphur_dioxide',
    'air_quality_PM2.5', 'air_quality_PM10',
    'air_quality_us-epa-index', 'air_quality_gb-defra-index',
]

# Drop them along the column axis; a missing column name raises KeyError
df = df.drop(col_to_remove, axis=1)
df

In [None]:
# Keep only the rows whose region is "Andaman and Nicobar Islands"
condition = df['region'].eq('Andaman and Nicobar Islands')
df = df.loc[condition]
df

In [None]:
# Collect the distinct weather condition names in order of first appearance
distinct_values = df['condition_text'].unique().tolist()
print(distinct_values)
print(f"There are {len(distinct_values)} distinct weather types in the dataset")

In [None]:
# Percentage of missing values in each column
missing_values = df.isna()
miss_percent = missing_values.sum().div(len(df)).mul(100)
missing_data = miss_percent.to_frame(name='missing percent')
print(missing_data)

In [None]:
# Flag every row that has an identical copy elsewhere (keep=False marks all copies)
duplicated_rows = df.duplicated(keep=False)
print(duplicated_rows)

# Show the flagged rows themselves
duplicate_data = df.loc[duplicated_rows]
print(duplicate_data)

# Keep the first occurrence of each duplicated row (the default) and drop the rest
df = df.drop_duplicates()
print('number of rows: ', len(df))

In [None]:
# Data type of each column (df.dtypes is indexed by column name, not by row)
data_types = df.dtypes
print(data_types)

In [None]:
# Disabled snippet kept as a bare string literal, so none of it is executed.
# If enabled, it would drop the last 5 characters of every 'last_updated' value.
'''my_column = df['last_updated']
modified_column = my_column.apply(lambda x: x[:-5])
df['last_updated'] = modified_column
df
'''

In [None]:
# Cast the selected columns to float64, one column at a time
for float_col in ('humidity', 'wind_degree', 'cloud'):
    df[float_col] = df[float_col].astype('float64')

# Print the dtypes again to confirm the conversion
data_types = df.dtypes
print(data_types)

ENCODING CONDITION TEXT

In [None]:

from sklearn.preprocessing import LabelEncoder

# Replace each weather description in 'condition_text' with an integer class code
encoder = LabelEncoder()
df['condition_text'] = encoder.fit_transform(df['condition_text']).astype(int)

# The original labels known to the encoder (classes_[i] is the label for code i)
encoded_classes = encoder.classes_
#print("Encoded Classes:", encoded_classes)

# Round-trip the codes back to their text labels
decoded_labels = encoder.inverse_transform(df['condition_text'])
#print("Decoded Labels:", decoded_labels)

ARIMA FOR NEXT 7 DAYS

In [None]:
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Number of trailing rows held out for evaluation; the same number of steps is forecast.
# NOTE(review): 7 steps equal 7 days only if the series has one row per day — confirm.
FORECAST_STEPS = 7

# Set the date column as the index. Guarded so the cell can be re-run: after the
# first run 'last_updated' is the index and no longer a column.
# NOTE(review): the rows are not sorted by timestamp here — confirm they are
# already in chronological order.
if 'last_updated' in df.columns:
    df['last_updated'] = pd.to_datetime(df['last_updated'])
    df = df.set_index('last_updated')

# Select the parameters for prediction
parameters = ['temperature_fahrenheit', 'wind_kph', 'wind_degree', 'pressure_mb',
              'precip_mm', 'humidity', 'cloud', 'visibility_km']

forecasts = {}     # parameter name -> forecast Series
metric_rows = []   # one record of error metrics per parameter

# Fit one independent ARIMA model per parameter
for param in parameters:
    # Hold out the last FORECAST_STEPS rows for evaluation
    train_data = df[param].iloc[:-FORECAST_STEPS]
    test_data = df[param].iloc[-FORECAST_STEPS:]

    # Fit an ARIMA model
    model = ARIMA(train_data, order=(5, 2, 2))  # Adjust the order as needed
    model_fit = model.fit()

    # Forecast as many steps as were held out
    predictions = model_fit.forecast(steps=FORECAST_STEPS)

    # Print the predictions
    print(f"Predictions for {param}:")
    print(predictions)
    print()
    forecasts[param] = predictions

    # Score this parameter against its own hold-out values. Doing this inside
    # the loop reports every parameter instead of only the last one fitted.
    mae = mean_absolute_error(test_data, predictions)
    mse = mean_squared_error(test_data, predictions)
    # Take the root directly: the `squared=False` argument is deprecated in
    # recent scikit-learn releases.
    rmse = np.sqrt(mse)
    metric_rows.append({'parameter': param, 'MAE': mae, 'MSE': mse, 'RMSE': rmse})

# One column per parameter, one row per forecast step
predictions_df = pd.DataFrame(forecasts)
print(predictions_df)

# Print the accuracy metrics for every parameter
metrics_df = pd.DataFrame(
    metric_rows, columns=['parameter', 'MAE', 'MSE', 'RMSE']
).set_index('parameter')
print("Accuracy metrics per parameter (MAE, MSE, RMSE):")
print(metrics_df)

NORMALISE BEFORE CLASSIFICATION

In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Select the numerical columns, but leave 'condition_text' out: it holds the
# integer class codes written by the LabelEncoder and is decoded later with
# encoder.inverse_transform, so it must stay as unscaled integers.
# errors='ignore' covers platforms where the column is not int64 and therefore
# is not in the selection to begin with.
numerical_columns = (
    df.select_dtypes(include=['float64', 'int64'])
      .columns
      .drop('condition_text', errors='ignore')
)

# Create an instance of MinMaxScaler
scaler = MinMaxScaler()

# Fit the scaler on the feature columns and overwrite them with the scaled values.
# Run this cell once per kernel session: a second run would refit the scaler on
# the already-scaled data.
df[numerical_columns] = scaler.fit_transform(df[numerical_columns])
df


DECISION TREE CLASSIFIER

In [None]:
# Approach the problem with a DECISION TREE CLASSIFIER
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Features and target are picked by column position.
# NOTE(review): positions 1,3..9 presumably are the forecasted parameters and
# position 2 presumably is 'condition_text' — confirm against df.columns.
X = df.iloc[:, [1, 3, 4, 5, 6, 7, 8, 9]].to_numpy()
y = df.iloc[:, 2].to_numpy()

# Hold out 20% of the rows for evaluation; the seed fixes the split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)

# Build and train the tree in one expression (fit returns the estimator itself)
clf = tree.DecisionTreeClassifier(random_state=18).fit(X_train, y_train)

# Predict the held-out rows and report the share of correct predictions
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy*100,"%")

In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Scale the forecasts with the statistics of the scaler that was already fitted
# on the training frame. Fitting a new MinMaxScaler on the forecast rows would
# stretch each column to its own 0..1 range, which is a different scale from the
# data the classifier was trained on.
feature_columns = list(predictions_df.columns)
fitted_columns = list(scaler.feature_names_in_)
positions = [fitted_columns.index(name) for name in feature_columns]

# MinMaxScaler.transform computes X * scale_ + min_ column by column; the same
# formula is applied here to just the forecasted columns, so it works even if
# the scaler was fitted on additional columns. predictions_df itself is left
# untouched and keeps the raw forecasts.
predictions_scaled = (
    predictions_df[feature_columns].to_numpy() * scaler.scale_[positions]
    + scaler.min_[positions]
)

# The classifier was fitted on a plain array, so it is given a plain array here
pred = clf.predict(predictions_scaled)
#print(pred)

# Decode the encoded values to get the original labels
decoded_labels = encoder.inverse_transform(pred)

# Print the decoded labels
print("Decoded Labels:", decoded_labels)

In [None]:
# Take the final 7 rows of the frame
last_7_rows = df.iloc[-7:]
#print(last_7_rows['condition_text'])

# Turn their encoded 'condition_text' values back into the original labels
decoded_labels = encoder.inverse_transform(last_7_rows['condition_text'])

# Show the decoded labels
print("Decoded Labels:", decoded_labels)
