## Importing necessary libraries

In [1]:
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LinearRegression

In [2]:
# for data manipulation

import pandas as pd
import numpy as np
from numpy.typing import NDArray
from matplotlib import pyplot as plt
import seaborn as sns
import json
import category_encoders as ce
from category_encoders import wrapper
from scipy import stats

import yaml

## Reading the dataset

In [None]:
with open('config.yaml', 'r') as config:
    cfg = yaml.safe_load(config)
df = pd.read_csv(cfg["dataset"])

## Definition all necessary functions.
#### All implementation of these functions are described in dataset_analysis.ipynb also

In [3]:
def conversion_boolean_features(df: pd.DataFrame) -> pd.DataFrame:
    # Convert boolean features to float.
    # True -> 1.0, False -> 0.0
    bool_df = df.select_dtypes(exclude=['int16', 'int32', 'int64', 'float16', 'float32', 'float64', 'object'])
    for column in bool_df.columns:
        df[column] = df[column].map({True: 1.0, False: 0.0}).astype(float)
        
    return df
def add_column_mean_coordinates_for_series(start: pd.Series, end: pd.Series) -> pd.Series:
    # For series parameters
    
    # Add new column with mean coordinates.
    # If end value is NaN, fill it with start value.
    fill_end = end.fillna(start)
    # Calculate mean value.
    new_series = (start + fill_end) / 2
    return new_series

def add_mean_coordinates_for_frame(df: pd.DataFrame) -> pd.DataFrame:
    # For frame parameter
    
    # Add new columns with mean coordinates. 
    df['Lat'] = add_column_mean_coordinates_for_series(df['Start_Lat'], df['End_Lat'])
    df['Lng'] = add_column_mean_coordinates_for_series(df['Start_Lng'], df['End_Lng'])
    
    return df

# Columns with NaN values, that should be replaced
nan_num_column = ['Wind_Speed(mph)', #mean
                  'Visibility(mi)', #mean
                  'Humidity(%)', #mean
                  'Temperature(F)', #mean
                  'Pressure(in)', #mean
                  'Precipitation(in)' #mean
                  ]

def fill_num_columns(df: pd.DataFrame) -> pd.DataFrame:
    # Fill NaN values in columns with mean values.
    for col_name in nan_num_column:
        df[col_name] = df.groupby('Severity')[col_name].transform(lambda x: x.fillna(x.mean()))
        
    return df
def delete_emissions(df: pd.DataFrame) -> pd.DataFrame:
    # Delete rows with emissions.
    for col, upper_bound, down_bound in (
        ('Distance(mi)', 150, -150),
        ('Temperature(F)', 150, -50),
        ('Pressure(in)', 50, 10),
        ('Visibility(mi)', 100, -100),
        ('Wind_Speed(mph)', 400, -400),
        ("Precipitation(in)",10, -15),
    ):
        df = df.drop(df[df[col] > upper_bound].index)
        df = df.drop(df[df[col] < down_bound].index)
        
    return df
def add_new_time_columns(df: pd.DataFrame) -> pd.DataFrame:
    # Convert time columns to accurate format.
    df['Start_Time'] = pd.to_datetime(df['Start_Time']).dt.round("S")
    df['End_Time'] = pd.to_datetime(df['End_Time']).dt.round("S")
    df['Weather_Timestamp'] = pd.to_datetime(df['Weather_Timestamp']).dt.round("S")

    df.loc[df['Weather_Timestamp'].isna(), 'Weather_Timestamp'] = df['Start_Time'].loc[df['Weather_Timestamp'].isna()]

    df['Start_Date_Year'] = df['Start_Time'].dt.year
    df['Start_Date_Month'] = df['Start_Time'].dt.month
    df['Start_Date_Day'] = df['Start_Time'].dt.day
    df['Start_Date_Hour'] = df['Start_Time'].dt.hour

    df['End_Date_Year'] = df['End_Time'].dt.year
    df['End_Date_Month'] = df['End_Time'].dt.month
    df['End_Date_Day'] = df['End_Time'].dt.day
    df['End_Date_Hour'] = df['End_Time'].dt.hour

    df['Weather_Datestamp_Year'] = df['Weather_Timestamp'].dt.year
    df['Weather_Datestamp_Month'] = df['Weather_Timestamp'].dt.month
    df['Weather_Datestamp_Day'] = df['Weather_Timestamp'].dt.day
    df['Weather_Datestamp_Hour'] = df['Weather_Timestamp'].dt.hour
    
    return df

# New names for columns
new_twilight_names = {
    'Sunrise_Sunset': 'Is_Day',
    "Civil_Twilight": "Is_Civil_Day",
    "Nautical_Twilight": "Is_Twilight_Day",
    "Astronomical_Twilight": "Is_Astronomical_Day"
}
def rename_twillight_columns(df: pd.DataFrame) -> pd.DataFrame:
    # Rename columns with twilights to more understandable names.
    for column in new_twilight_names.keys():
        df[new_twilight_names[column]] = df[column].map({'Day': 1.0, 'Night': 0.0}).astype(float)
    df.drop(columns=list(new_twilight_names.keys()), inplace=True)
    
    return df

def rename_timezone_meanings(df: pd.DataFrame) -> pd.DataFrame:
    # Rename Timezone meanings.
    df['Timezone'] = df['Timezone'].replace({
        "US/Eastern": "Easterm",
        "US/Central": "Central",
        "US/Pacific": "Pacific",
        "US/Mountain": "Mountain"
    })
    
    return df

def rename_wind_direction_columns(df: pd.DataFrame) -> pd.DataFrame:
    # Rename Wind Direction meanings.
    df['Wind_Direction'] = df['Wind_Direction'].replace({'South': 'S',
                                                                     'West': 'W',
                                                                     'North': 'N',
                                                                     'Variable': 'VAR',
                                                                     'East': 'E',
                                                                     'Calm': 'CALM'
                                                                     })

    df['Wind_Direction'].fillna('VAR', inplace=True)
    
    return df

def drop_excess_parametres(df: pd.DataFrame) -> pd.DataFrame:
    # Drop columns with unnecessary information.
    df = df.drop(columns=['ID',
                           'Source',
                           'Start_Lat',
                           'Start_Lng',
                           'End_Lat',
                           'End_Lng',
                           'Wind_Chill(F)',
                           'Country',
                           'Zipcode',
                           'Airport_Code',
                           'Description',
                           'Street',
                           'Weather_Timestamp',
                           'Start_Time',
                           'End_Time',
                           ])
    
    df = df.dropna()
    
    return df

def rename_columns(df: pd.DataFrame) -> pd.DataFrame:
    # Common function for renaming columns.
    df = rename_twillight_columns(df)
    df = rename_timezone_meanings(df)
    df = rename_wind_direction_columns(df)
    
    return df

def data_condersion(df: pd.DataFrame) -> pd.DataFrame:
    # Common function for data conversion.
    df = conversion_boolean_features(df)
    df = add_mean_coordinates_for_frame(df)
    df = fill_num_columns(df)
    df = delete_emissions(df)
    df = add_new_time_columns(df)
    df = rename_columns(df)
    
    return df
    
class WeatherEncoder(TransformerMixin):
    """
    A transformer class for encoding weather conditions into understandable features.

    This class takes a series of weather conditions as input and encodes them into understandable features
    based on a predefined set of replacement words. It implements the `fit` and `transform` methods
    required by the `TransformerMixin` interface.

    Attributes:
        words_ (set): A set of unique words extracted from the input data.
        replacement_words_ (dict): A dictionary mapping words to their replacement values.

    Methods:
        fit(X, y=None): Fit the transformer to the input data.
        transform(X): Transform the input data into understandable features.

    """

    def __init__(self) -> None:
        super().__init__()
        self.words_ = None
        self.replacement_words_ = None
        with open(cfg['type_weather']) as file:
            self.replacement_words_ = json.load(file)
            
    def fit(self, X: NDArray, y: NDArray = None) -> None:
        """
        Fit the transformer to the input data.

        This method extracts unique words from the input data, replaces them with their corresponding
        replacement values, and stores the unique words in the `words_` attribute.

        Args:
            X (ndarray): The input data.
            y (ndarray, optional): The target data. Defaults to None.

        Returns:
            None

        """
        self.words_ = set()
        Processed_Weather_Condition = X.dropna().apply(
            lambda x: ' '.join(
                set(
                    sorted(
                            [
                            self.replacement_words_[word] for word in x.split(" ") if word in self.replacement_words_.keys()
                            ]
                        )
                    )
                ) 
            )

        for index in Processed_Weather_Condition.index:
            if 'Mix' in Processed_Weather_Condition[index]:
                Processed_Weather_Condition[index] = 'Mix'

        X = Processed_Weather_Condition.replace({'': np.nan})
        X.fillna(X.mode()[0], inplace=True)

        for line in X:
            for word in line.split(" "):
                if not word in self.words_:
                    self.words_.add(word)
                    
        return self
                    
    def transform(self, X: pd.Series) -> pd.Series:      
        """
        Transform the input data into understable features.

        This method takes the input data and encodes it into understable features based on the unique words
        extracted during the fitting process.

        Args:
            X (pd.Series): The input data.

        Returns:
            pd.DataFrame: A DataFrame containing the understable features.

        """
        X.fillna(X.mode()[0], inplace=True)
        new_columns = np.zeros((len(self.words_), len(X)), dtype=float)

        for i, line in enumerate(X):
            line_words = set(line.split(" "))
            for k, word in enumerate(self.words_):
                if word in line_words:
                    new_columns[k][i] = 1.0

        return_columns = dict()
        for idx, word in enumerate(self.words_):
            new_column = pd.Series(new_columns[idx])
            new_column.index = X.index
            return_columns[word] = new_column
            
        return pd.DataFrame(return_columns)

In [101]:
# Load the dataset
df = pd.read_csv(cfg["dataset"])
df

Unnamed: 0,ID,Source,Severity,Start_Time,End_Time,Start_Lat,Start_Lng,End_Lat,End_Lng,Distance(mi),...,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,Sunrise_Sunset,Civil_Twilight,Nautical_Twilight,Astronomical_Twilight
0,A-1,Source2,3,2016-02-08 05:46:00,2016-02-08 11:00:00,39.865147,-84.058723,,,0.01,...,False,False,False,False,False,False,Night,Night,Night,Night
1,A-2,Source2,2,2016-02-08 06:07:59,2016-02-08 06:37:59,39.928059,-82.831184,,,0.01,...,False,False,False,False,False,False,Night,Night,Night,Day
2,A-3,Source2,2,2016-02-08 06:49:27,2016-02-08 07:19:27,39.063148,-84.032608,,,0.01,...,False,False,False,False,True,False,Night,Night,Day,Day
3,A-4,Source2,3,2016-02-08 07:23:34,2016-02-08 07:53:34,39.747753,-84.205582,,,0.01,...,False,False,False,False,False,False,Night,Day,Day,Day
4,A-5,Source2,2,2016-02-08 07:39:07,2016-02-08 08:09:07,39.627781,-84.188354,,,0.01,...,False,False,False,False,True,False,Day,Day,Day,Day
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,A-1009761,Source2,2,2021-06-10 04:09:01,2021-06-10 05:45:47,28.436939,-81.348747,,,0.00,...,False,False,False,False,True,False,Night,Night,Night,Night
999996,A-1009762,Source2,2,2021-06-10 04:09:54,2021-06-10 05:45:44,28.426664,-81.307961,,,0.00,...,False,False,False,False,False,False,Night,Night,Night,Night
999997,A-1009763,Source2,2,2021-06-10 05:17:52,2021-06-10 06:42:43,27.765680,-82.641678,,,0.00,...,False,False,False,False,True,False,Night,Night,Night,Day
999998,A-1009764,Source2,2,2021-06-10 05:24:46,2021-06-10 06:42:42,27.901506,-82.637878,,,0.00,...,False,False,False,False,False,False,Night,Night,Night,Day


In [102]:
num_colums = df.select_dtypes(include=['int16', 'int32', 'int64', 'float16', 'float32', 'float64']).columns

In [103]:
# Data manipulation
df = data_condersion(df)
df = drop_excess_parametres(df)

In [104]:
# Get the target column and feature columns

target_column = 'Severity'
feature_columns = list(df.columns)
feature_columns.remove(target_column)

X = df[feature_columns]
Y = df[target_column]

In [105]:
# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [106]:
# Encoding the City column
count_encoder_city = ce.CountEncoder(normalize=True)
x_train['City_Encoded'] = count_encoder_city.fit_transform(x_train['City'])
x_test['City_Encoded'] = count_encoder_city.transform(x_test['City'])

In [107]:
# Encoding the County column
count_encoder_county = ce.CountEncoder(normalize=True)
x_train['County_Encoded'] = count_encoder_county.fit_transform(x_train['County'])
x_test['County_Encoded'] = count_encoder_county.transform(x_test['County'])

In [108]:
# Encoding the State column
label_encoder_state = LabelEncoder()
encoded = pd.Series(label_encoder_state.fit_transform(x_train['State']))
encoded.index = x_train.index
x_train['State_Encoded'] = encoded
encoded = pd.Series(label_encoder_state.transform(x_test['State']))
encoded.index = x_test.index
x_test['State_Encoded'] = encoded

In [109]:
# Encoding the Timezone and Wind_Direction columns
onehot_encoder = OneHotEncoder(sparse_output=False, min_frequency=5*10**(-4), handle_unknown="ignore")

encoded_categorical_columns_train = pd.DataFrame(onehot_encoder.fit_transform(x_train[['Timezone', 'Wind_Direction']]))
encoded_categorical_columns_test = pd.DataFrame(onehot_encoder.transform(x_test[['Timezone', 'Wind_Direction']]))

encoded_categorical_columns_train.columns = onehot_encoder.get_feature_names_out()
encoded_categorical_columns_train.index = x_train.index

encoded_categorical_columns_test.columns = onehot_encoder.get_feature_names_out()
encoded_categorical_columns_test.index = x_test.index

for column_name in onehot_encoder.get_feature_names_out():
    x_train[column_name] = encoded_categorical_columns_train[column_name]
    x_test[column_name] = encoded_categorical_columns_test[column_name]

In [110]:
# Encoding the Weather_Condition column
weather_transformer = WeatherEncoder()

encoded_train = weather_transformer.fit_transform(x_train['Weather_Condition'])
encoded_test = weather_transformer.transform(x_test['Weather_Condition'])

for word in weather_transformer.words_:
    x_train[word] = encoded_train[word]
    x_test[word] = encoded_test[word]

In [111]:
# Drop the original columns
x_train.drop(columns=['Timezone',
                      'Wind_Direction',
                      'Weather_Condition',
                      'State',
                      'County',
                      'City'
                      ], inplace=True)
x_test.drop(columns=['Timezone',
                      'Wind_Direction',
                      'Weather_Condition',
                      'State',
                      'County',
                      'City'
                      ], inplace=True)

In [112]:
# Feature columns
feature_columns = list(x_train.columns)

In [113]:
# Standardize the data
standard_scaler = StandardScaler()
x_train.loc[:, feature_columns] = standard_scaler.fit_transform(x_train)
x_test.loc[:, feature_columns] = standard_scaler.fit_transform(x_test)

In [114]:
len(x_train), len(y_train), len(x_test), len(y_test)
x_train

Unnamed: 0,Distance(mi),Temperature(F),Humidity(%),Pressure(in),Visibility(mi),Wind_Speed(mph),Precipitation(in),Amenity,Bump,Crossing,...,Rain,Dust,Whirls,Cloudy,Fog,Hail,Snow,Ice,Thunder,Mix
724212,-0.125238,-1.072822,1.143062,0.479091,0.369266,1.496071,-0.101755,-0.119052,-0.017715,-0.387069,...,-0.253852,-0.006961,0.0,1.506464,-0.111346,-0.001956,-0.10119,-0.007659,-0.070571,-0.026828
3957,-0.125238,2.033292,-2.429342,0.173988,0.369266,0.295964,-0.009684,-0.119052,-0.017715,-0.387069,...,-0.253852,-0.006961,0.0,-0.663806,-0.111346,-0.001956,-0.10119,-0.007659,-0.070571,-0.026828
855204,-0.125238,0.192632,-2.337742,-1.109983,0.369266,0.461496,0.032793,-0.119052,-0.017715,-0.387069,...,-0.253852,-0.006961,0.0,-0.663806,-0.111346,-0.001956,-0.10119,-0.007659,-0.070571,-0.026828
801341,-0.125238,-0.440095,-1.925542,0.517228,0.369266,0.047666,-0.101755,-0.119052,-0.017715,-0.387069,...,-0.253852,-0.006961,0.0,-0.663806,-0.111346,-0.001956,-0.10119,-0.007659,-0.070571,-0.026828
628599,-0.125238,0.710317,0.227061,0.479091,0.369266,0.461496,-0.101755,-0.119052,-0.017715,-0.387069,...,-0.253852,-0.006961,0.0,1.506464,-0.111346,-0.001956,-0.10119,-0.007659,-0.070571,-0.026828
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
264895,-0.117590,-0.727698,0.776661,0.453665,-0.077112,-0.407547,-0.009684,-0.119052,-0.017715,-0.387069,...,-0.253852,-0.006961,0.0,-0.663806,-0.111346,-0.001956,-0.10119,-0.007659,-0.070571,-0.026828
373331,0.884337,-0.842739,1.051461,0.301114,0.369266,2.447880,-0.101755,-0.119052,-0.017715,-0.387069,...,-0.253852,-0.006961,0.0,-0.663806,-0.111346,-0.001956,-0.10119,-0.007659,-0.070571,-0.026828
134069,-0.125238,0.997920,-0.368340,0.491803,0.369266,1.020166,-0.009684,-0.119052,-0.017715,2.583518,...,-0.253852,-0.006961,0.0,1.506464,-0.111346,-0.001956,-0.10119,-0.007659,-0.070571,-0.026828
685564,-0.125238,-1.935631,-0.688940,0.695205,-0.009456,0.461496,-0.101755,-0.119052,-0.017715,-0.387069,...,-0.253852,-0.006961,0.0,-0.663806,-0.111346,-0.001956,-0.10119,-0.007659,-0.070571,-0.026828


## Models

### KNN

In [18]:
knn = KNeighborsClassifier(n_neighbors=300,
                           metric="euclidean",
                           algorithm="brute",
                           weights="uniform"
                           )

knn.fit(X=x_train, y=y_train)


In [19]:
y_train_pred = knn.predict(X=x_train)
y_test_pred = knn.predict(X=x_test)

print(
    f"train={accuracy_score(y_true=y_train, y_pred=y_train_pred)}"
    "\n"
    f"test={accuracy_score(y_true=y_test, y_pred=y_test_pred)}"
    )

train=0.6283921130196158
test=0.6235897957109462


Results:
1. With the algorithm "brute" and uniform weights, and using different metrics, we obtained approximately the same results for $10^5$ samples, and for $10^6$ samples, the results were about the same.

2. The Ball_Tree method gave approximately the same results.

3. The KD-tree method did not improve the results.

4. When the number of nearest neighbors was set to 300, the results became approximately equal. This suggests that the model may have been over fitted with a small number of training samples. The final results for both datasets were approximately 0.62.

5. For $10^6$ samples, it took approximately 22 minutes to complete, which suggests that it may not be suitable for very large datasets.

### Desision Tree

In [141]:
tree = DecisionTreeClassifier(max_depth=10, min_samples_leaf=200, criterion='gini', random_state=0)
tree.fit(X=x_train, y=y_train)

In [142]:
y_train_pred = tree.predict(X=x_train)
y_test_pred = tree.predict(X=x_test)

print(
    f"train={accuracy_score(y_true=y_train, y_pred=y_train_pred)}"
    "\n"
    f"test={accuracy_score(y_true=y_test, y_pred=y_test_pred)}"
    )

train=0.8389342025392648
test=0.8376768125035069


Results: 
1. The decision tree provides a reasonably accurate prediction, with an accuracy of approximately 0.8 on a dataset with $10^6$ elements. However, for Severity level 42, the prediction is not accurate at all.

2. By experimenting with the number of splits and maximum depth, it is possible to improve the accuracy. However, even with these optimizations, Severity 4 is still not accurately predicted.

3. Switching from the Gini impurity to the entropy criterion does not significantly change the results.

4. As the dataset size increases, the model begins to accurately predict Severity 4, although the proportion of correct predictions remains approximately 0.9.

5. With a maximum depth of 10 and a minimum number of samples per leaf of 500 and using the Gini criterion, the model accurately predicts all four Severity levels.

### Random Forest

In [36]:
random_forest = RandomForestClassifier(n_estimators=10,
                                       max_depth=5,
                                       min_samples_leaf=30,
                                       criterion="gini",
                                       random_state=0,
                                       )
random_forest.fit(X=x_train, y=y_train)

In [37]:
y_train_pred = random_forest.predict(X=x_train)
y_test_pred = random_forest.predict(X=x_test)

print(
    f"train={accuracy_score(y_true=y_train, y_pred=y_train_pred)}"
    "\n"
    f"test={accuracy_score(y_true=y_test, y_pred=y_test_pred)}"
    )

train=0.9651641426974286
test=0.9687976420367923


Results: <br>
The decision tree performs better, which is likely due to the presence of a large number of instances with two and three features. The random forest takes into account specific features, while also attempting to utilize various indicators. However, there is an excessive amount of data with two and three attributes, and everything falls within these categories.


### Bagging Classifier

In [38]:
bagging = BaggingClassifier(n_estimators=10,
                            max_samples=0.8,
                            max_features=0.9,
                            random_state=0,
                            estimator=DecisionTreeClassifier(max_depth=8,
                                                             min_samples_leaf=200,
                                                             criterion='gini',
                                                             random_state=0
                                                             )
                            )
bagging.fit(X=x_train, y=y_train)

In [39]:
y_train_pred = bagging.predict(X=x_train)
y_test_pred = bagging.predict(X=x_test)

print(
    f"train={accuracy_score(y_true=y_train, y_pred=y_train_pred)}"
    "\n"
    f"test={accuracy_score(y_true=y_test, y_pred=y_test_pred)}"
    )

train=0.9766871633296067
test=0.9763187315784124


Results:

1) Using Bagging with a Decision Tree Classifier with n_estimators = 3, max_samples = 0.8 and max_features = 0.9 produces very good results. At $10^5$, the accuracy is approximately 0.97, and the model correctly predicts all Severity levels.

2) However, when using a larger dataset, retraining the model does not significantly improve the results. Changing parameters also does not produce a significant impact on the situation.

3) Using a Decision Tree Classifier for the estimator does not lead to full retraining of the dataset, but the accuracy decreases to 0.89. It is worth noting, however, that 1 and 4 were not predicted well.

### SVM

In [40]:
svm = SVC(kernel="sigmoid",
          max_iter=8000,
          random_state=0
          )
svm.fit(X=x_train, y=y_train)



In [41]:
y_train_pred = svm.predict(X=x_train)
y_test_pred = svm.predict(X=x_test)

print(
    f"train={accuracy_score(y_true=y_train, y_pred=y_train_pred)}"
    "\n"
    f"test={accuracy_score(y_true=y_test, y_pred=y_test_pred)}"
    )

train=0.5725175322695396
test=0.5743978046549446


Results:

1) A Support Vector Machine (SVM) with a linear kernel is not effective. The accuracy of the model is 0.54.

2) An SVM with a polynomial kernel also does not yield good results. The accuracy remains at 0.44, even when the degree of the polynomial is varied.

3) An SVM using a radial basis function (RBF) also proves to be ineffective, with an accuracy of 0.47.

4) Finally, an SVM with the sigmoid kernel also fails to deliver accurate predictions, with an average accuracy of 0.57.

### Logistic Regression

In [173]:
logistic_regression = LogisticRegression(multi_class='multinomial', solver='lbfgs')
logistic_regression.fit(X=x_train, y=y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [174]:
y_train_pred = logistic_regression.predict(X=x_train)
y_test_pred = logistic_regression.predict(X=x_test)

print(
    f"train={accuracy_score(y_true=y_train, y_pred=y_train_pred)}"
    "\n"
    f"test={accuracy_score(y_true=y_test, y_pred=y_test_pred)}"
    )

train=0.7907031493025745
test=0.7905310435720913


In [204]:
class BinLogisticClassifier(BaseEstimator):
    def __init__(self, classes: list | None = None) -> None:
        super().__init__()
        self.classes_ = classes
        self._Regressors = [
            LogisticRegression(multi_class="auto", solver="lbfgs"), # x_1, x_2 | x_3, x_4
            LogisticRegression(multi_class="auto", solver="lbfgs"), # x_1 | x_2
            LogisticRegression(multi_class="auto", solver="lbfgs") # x_3 | x_4
        ]
        
    def fit(self, X: NDArray, y: NDArray) -> None:
        train_classes = set(y)
        if self.classes_ == None:
            self.classes_ = list(train_classes)
        elif not self.classes_ == train_classes:
            ValueError("Not all classes were passed during initialization") 
        
        first_cls, second_cls = self.classes_[:2], self.classes_[2:]
        
        y_train_bin = [
            0 if cls in first_cls else 1
            for cls in y 
            ]
        
        self._Regressors[0].fit(X=X, y=y_train_bin)
        
        mask = y.isin(first_cls)
        
        first_X = X.loc[mask]
        second_X = X.loc[~mask]
        
        first_y_train_bin = pd.Series([
            0 if cls == first_cls[0] else 1
            for cls in y.loc[mask]
        ])
        second_y_train_bin = pd.Series([
            0 if cls == second_cls[0] else 1
            for cls in y.loc[~mask]
        ])
        
        self._fit(X=first_X, y=first_y_train_bin, bin_cls=0)
        self._fit(X=second_X, y=second_y_train_bin, bin_cls=1)
        
    def _fit(self, X: pd.DataFrame, y: pd.Series, bin_cls: int) -> None:
        self._Regressors[1 + bin_cls].fit(X=X, y=y)
        
    def predict(self, X: NDArray) -> NDArray:
        y_bin = self._Regressors[0].predict(X=X)
        mask = y_bin == 0
        
        first_X = X.loc[mask]
        second_X = X.loc[~mask]
                
        y_bins = [None, None]
        y_bins[0] = self._predict(first_X, 0)
        y_bins[1] = self._predict(second_X, 1)
        
        k = [-1, -1]
        encoded_classes = list()
        
        for cls in y_bin:
            k[cls] += 1
            encoded_classes.append((cls, y_bins[cls][k[cls]]))
            
        return np.array([
            self._decode(cls) for cls in encoded_classes
        ])
            
    def _predict(self, X: pd.DataFrame, bin_cls: int) -> NDArray:
        return self._Regressors[1 + bin_cls].predict(X=X)
    
    def _decode(self, code: tuple):
        code_book = {
            (0, 0): self.classes_[0],
            (0, 1): self.classes_[1],
            (1, 0): self.classes_[2],
            (1, 1): self.classes_[3]
        }
        return code_book[code]
    

In [205]:
bin_logistic_regression = BinLogisticClassifier([1, 4, 2, 3])
bin_logistic_regression.fit(X=x_train, y=y_train)

In [206]:
y_train_pred = bin_logistic_regression.predict(X=x_train)
y_test_pred = bin_logistic_regression.predict(X=x_test)

print(
    f"train={accuracy_score(y_true=y_train, y_pred=y_train_pred)}"
    "\n"
    f"test={accuracy_score(y_true=y_test, y_pred=y_test_pred)}"
    )

ValueError: Found array with 0 sample(s) (shape=(0, 73)) while a minimum of 1 is required by LogisticRegression.

In [332]:
class LogisticRegressionReductingClassifier(BaseEstimator):
    def __init__(self, classes: list | None = None) -> None:
        super().__init__()
        self.classes_ = classes
        self._Regressors = [
            LogisticRegression(multi_class="auto", solver="lbfgs"), # x_1, x_2, x_3 | x_4
            LogisticRegression(multi_class="auto", solver="lbfgs"), # x_1,  x_2 | x_3
            LogisticRegression(multi_class="auto", solver="lbfgs") # x_1 | x_2
        ]
        
    def fit(self, X: NDArray, y: NDArray) -> None:
        train_classes = set(y)
        if self.classes_ == None:
            self.classes_ = list(train_classes)
        elif not self.classes_ == train_classes:
            ValueError("Not all classes were passed during initialization") 
            
        for i in range(len(self._Regressors)):
            classes = self.classes_[:len(self._Regressors) + 1 - i]
            mask = y.isin(classes)
            self._fit(X=X[mask], y=y[mask], n_regressor=i, classes=classes)
        
    def _fit(self, X: pd.DataFrame, y: pd.Series, n_regressor: int, classes: list) -> None:
        first_cls, second_cls = classes[:-1], classes[-1]
        
        y_train_bin = [
            0 if cls in first_cls else 1
            for cls in y 
            ]
        
        self._Regressors[n_regressor].fit(X=X, y=y_train_bin)
        
    def predict(self, X: pd.DataFrame) -> NDArray:
        
        y_predict = np.zeros(len(X), dtype=int)
        
        for i in range(len(self._Regressors)):
            y_bin = self._predict(X=X, n_regressor=i)
            mask = y_bin == 0
            X = X.loc[mask]
            k = 0
            for idx, y in enumerate(y_predict):
                if y == 0:
                    if y_bin[k] == 1:
                        y_predict[idx] = self.classes_[-(i + 1)]
                    k += 1
                    
            if len(X) == 0: 
                return y_predict
            
        return y_predict
            
            
    def _predict(self, X: pd.DataFrame, n_regressor: int) -> NDArray:
        return self._Regressors[n_regressor].predict(X=X)
    

In [329]:
logistic_regression_reducting = LogisticRegressionReductingClassifier([1, 4, 3, 2])
logistic_regression_reducting.fit(X=x_train, y=y_train)

In [330]:
y_train_pred = logistic_regression_reducting.predict(X=x_train)
y_test_pred = logistic_regression_reducting.predict(X=x_test)

print(
    f"train={accuracy_score(y_true=y_train, y_pred=y_train_pred)}"
    "\n"
    f"test={accuracy_score(y_true=y_test, y_pred=y_test_pred)}"
    )

0
0
train=0.6371709523325542
test=0.6441711556052444


Results:

1) Based on a large dataset, logistic regression produces a fairly accurate result, approximately 0.8.

2) If you use a sufficiently large dataset, the main issue is that during the first iteration of the regression, there is a tendency for the model to overfit, which can lead to problems and rarely produces accurate predictions for 1 and 4.

### Naive Bassian Classifier

In [44]:
bassian = GaussianNB()
bassian.fit(X=x_train, y=y_train)

In [45]:
y_train_pred = bassian.predict(X=x_train)
y_test_pred = bassian.predict(X=x_test)

print(
    f"train={accuracy_score(y_true=y_train, y_pred=y_train_pred)}"
    "\n"
    f"test={accuracy_score(y_true=y_test, y_pred=y_test_pred)}"
    )

train=0.04845512755361317
test=0.552393535928448


Results:

Due to the fact that the data is unbalanced, the results are not accurate. Additionally, it should be noted that the assumption of uncorrelated data may not be valid. There is a slight error in our analysis.

### Linear Discriminant Analysis

In [53]:
discriminant_analyse = LinearDiscriminantAnalysis(solver='lsqr')
discriminant_analyse.fit(X=x_train, y=y_train)

In [54]:
y_train_pred = discriminant_analyse.predict(X=x_train)
y_test_pred = discriminant_analyse.predict(X=x_test)

print(
    f"train={accuracy_score(y_true=y_train, y_pred=y_train_pred)}"
    "\n"
    f"test={accuracy_score(y_true=y_test, y_pred=y_test_pred)}"
    )

train=0.6319239760138226
test=0.6391401565199716


Результаты: <br/>


In [23]:
count_predicted_Severity = dict.fromkeys([1, 2, 3, 4], 0)
count_real_Severity = dict.fromkeys([1, 2, 3, 4], 0)

for p, r in zip(y_train_pred, y_train):
    count_real_Severity[r] += 1
    count_predicted_Severity[p] += 1
    
count_real_Severity, count_predicted_Severity

({1: 73, 2: 43259, 3: 35354, 4: 26}, {1: 0, 2: 43321, 3: 35391, 4: 0})