In [None]:
import numpy as np
import pandas as pd
import os
import arff
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score,confusion_matrix, precision_score, recall_score, f1_score

# Seed NumPy's global random number generator so that code relying on it is repeatable between runs
np.random.seed(seed=42)

## Data Loading and Preprocessing

In this section we load the dataset and perform the preprocessing steps needed to make it available for the analysis. In particular, we go through the following steps:

- Load all the .arff files in pandas dataframes
- convert all the numeric values of the dataframes from strings to floats
- check that the only string in the dfs (apart from the ones indicating the country) is 'm', which indicates a missing value
- replace all missing values 'm' with the mean value of the corresponding column

In particular, for the last step we first consider only the values that belong to the same country as the missing value, and we calculate their mean. If there are no values for that country, we simply use the mean of the whole column.

In [None]:
def create_df(folder='data'):
    """
        Create one DataFrame per ARFF file found in a folder.

        Parameters:
        - folder (str): Directory containing the ARFF files. Default is 'data'.

        Returns:
        - df_ls (list of pandas.DataFrame): One DataFrame per file, in sorted
          file-name order. Each DataFrame gets a leading 'Quarter' column
          holding the first 7 characters of the file name.
    """

    df_ls = [] # Initializing list to store DataFrames

    for f in sorted(os.listdir(folder)): # Sorted so the list follows file-name order
        print(f)
        file = os.path.join(folder, f) # Creating file path
        # The context manager closes the handle even if parsing fails
        with open(file, 'r') as handle:
            dataset = arff.load(handle) # Loading ARFF file
        df = pd.DataFrame(dataset['data']) # Creating DataFrame
        df.insert(0, 'Quarter', f[:7]) # Inserting 'Quarter' column
        df_ls.append(df) # Appending DataFrame to the list

    return df_ls

def convert_to_float(val):
    """
        Convert a value to float when possible.

        Parameters:
        - val: Value to be converted.

        Returns:
        - float_val: The value as a float, or the original value unchanged
          when it cannot be converted (e.g. the string 'm' or None).
    """
    try:
        return float(val) # Convert the value to float
    except (ValueError, TypeError):
        # ValueError: non-numeric string; TypeError: non-string/non-number such as None
        return val  # Return the original value for non-convertible inputs

def check_unique_strings(df_ls):
    """
        Check that 'm' is the only string present in the value columns.

        The first two columns of each DataFrame are skipped; every other
        column is inspected.

        Parameters:
        - df_ls (list of pandas.DataFrame): List of DataFrames.

        Raises:
        - ValueError: If any DataFrame contains a string other than 'm'.
    """
    for i, full_df in enumerate(df_ls):
        df = full_df.iloc[:, 2:] # Extracting relevant columns from DataFrame

        unique_values = df.stack().unique() # Finding unique values in the DataFrame
        # Keep only strings that are not the missing-value marker. Counting the
        # strings is not enough: a single string other than 'm' must fail too.
        unexpected = sorted({v for v in unique_values if isinstance(v, str) and v != 'm'})

        if unexpected:
            raise ValueError(
                "The DataFrame contains multiple unique strings, 'm' is not the only one "
                f"(DataFrame {i}: {unexpected})"
            )


def check_specific_value(df_ls, specific_value='m'):
    """
        Report columns whose only distinct value is a given marker.

        For every DataFrame, the columns from the third up to (but excluding)
        the last are scanned; a message is printed for each column whose
        value counts hold a single entry equal to `specific_value`.

        Parameters:
        - df_ls (list of pandas.DataFrame): List of DataFrames.
        - specific_value (str): Value to look for. Default is 'm'.
    """
    for frame_idx, frame in enumerate(df_ls):
        for col in frame.columns[2:-1]:
            counts = frame[col].value_counts() # Occurrences of each distinct value

            # More (or fewer) than one distinct value: nothing to report
            if len(counts) != 1:
                continue

            if counts.index[0] == specific_value:
                print(f"In DataFrame {frame_idx}, the column '{col}' contains only the specific value '{specific_value}'.")


def replace_m(df_ls):
    """
        Replace 'm' values in the feature columns with a mean of known values.

        For each feature column (third column up to, but excluding, the last),
        every 'm' is replaced by the mean of the non-'m' values that share the
        same country (column 0). When that country has no known value in the
        column, the mean of all known values of the column is used instead.
        Columns made up entirely of 'm' are left untouched because there is
        nothing to estimate from.

        The DataFrames are modified in place.

        Parameters:
        - df_ls (list of pandas.DataFrame): List of DataFrames.

        Returns:
        - df_ls (list of pandas.DataFrame): The same list, with 'm' replaced.
    """
    for df in df_ls:
        countries = df[0] # Country of each row

        for col in df.columns[2:-1]: # Feature columns only
            missing = df[col] == 'm'
            if not missing.any() or missing.all():
                continue # Nothing to fill, or no known value to fill with

            known = pd.to_numeric(df.loc[~missing, col])
            # Mean of the known values per country
            country_means = known.groupby(countries[~missing]).mean()

            # Each missing row gets its own country's mean; countries without
            # any known value fall back to the column-wide mean.
            fill = countries[missing].map(country_means).fillna(known.mean())

            # Assign on the DataFrame itself (not on a column selection) so the
            # update is guaranteed to land in `df`.
            df.loc[missing, col] = fill.to_numpy()
            df[col] = pd.to_numeric(df[col]) # Column is fully numeric now

    return df_ls


In [None]:
df_ls = create_df(folder='data')

# Parse every cell after the first two columns to float where possible; cells
# that cannot be parsed (such as the 'm' marker) are left unchanged.
# Series.map is applied column by column instead of DataFrame.applymap, which
# is deprecated in recent pandas releases.
for df in df_ls:
    df.iloc[:, 2:] = df.iloc[:, 2:].apply(lambda column: column.map(convert_to_float))

In [None]:
# Validate the loaded data before touching it: raises ValueError if unexpected strings are found
check_unique_strings(df_ls)
# Print a message for every feature column that contains nothing but 'm'
check_specific_value(df_ls, specific_value='m')
# Fill in the 'm' markers of the feature columns; the DataFrames in df_ls are modified in place
replace_m(df_ls)

## Data Analysis of Pre/Post Covid trends

In this section we analyze the variation of the features of the dataset before and after Covid (pre <= Q1 2020, after >= Q2 2020). To evaluate the variation, we first split the dataset into pre- and post-Covid data. Then we have to verify whether there is a notable change. Since we have many features to analyze, we need a metric that summarizes each of them. A possible solution would be to calculate the mean value of each feature and observe its variation. However, during the analysis we observed that some features are characterized by large outliers, which strongly influence the mean value and could lead to a misleading analysis. For this reason we chose a metric that still gives an estimate of the value of the feature but is not affected by outliers: the median. So what we finally calculate is:

- variation = |(median_pre -  median_post)/median_pre| * 100

In the case in which the median_pre is 0 we simply do variation = |median_pre -  median_post|*100 to avoid division by zero.

In [None]:
def calc_variation(median_pre,median_post):
    """
        Calculate the absolute percentage variation between two median values.

        Parameters:
        - median_pre (float): Median value before a certain event.
        - median_post (float): Median value after a certain event.

        Returns:
        - variation (float): |median_post - median_pre| / |median_pre| * 100,
          or |median_post - median_pre| * 100 when median_pre is 0 (the
          difference is not normalised, to avoid a division by zero).
    """

    delta = median_post - median_pre

    # Usual case: express the change relative to the pre-event median
    if median_pre != 0.:
        return np.abs(delta / median_pre) * 100

    # Zero baseline: fall back to the raw difference
    return np.abs(delta) * 100


def plot_variations(variations,title='',custom_y_tick=False,color='skyblue', figsize=(15, 10), fontsize=25):
    """
        Plot the variations as a bar chart, one bar per feature.

        Parameters:
        - variations (list): List of variation values for each feature.
        - title (str): Text inserted in the plot title. Default is an empty string.
        - custom_y_tick (bool): Whether to place y-axis ticks every 25 units. Default is False.
        - color (str): Color for the bars in the plot. Default is 'skyblue'.
        - figsize (tuple): Figure size passed to matplotlib. Default is (15, 10).
        - fontsize (int): Font size of the axis labels; the title uses fontsize + 2. Default is 25.
    """

    # Bars sit at 0..n-1 and are labelled 1..n
    x_positions = np.arange(len(variations))
    x_labels = x_positions + 1

    plt.figure(figsize=figsize)

    # Plot the bar chart with adjusted width
    plt.bar(x_positions, variations, width=0.6,color=color, edgecolor='black')

    # Add labels and title
    plt.xlabel('Feature', fontsize=fontsize)
    plt.ylabel('% Variation', fontsize=fontsize)
    plt.title(f'% Variation {title} before and after Covid (pre <= Q1 2020, after >= Q2 2020)', fontsize=fontsize+2, pad=20)

    # Set custom x tick labels, rotated for better readability
    plt.xticks(x_positions, x_labels, fontsize=10, rotation=45)

    if custom_y_tick:
        # Only finite values can define the tick range: an empty list or a NaN
        # (e.g. the median of an empty selection) would otherwise raise here.
        finite = np.asarray(variations, dtype=float)
        finite = finite[np.isfinite(finite)]
        if finite.size:
            plt.yticks(np.arange(0, finite.max() + 25, 25), fontsize=16)

    # The figure is left open so the notebook can render it
    plt.tight_layout()

In [None]:
# Quarter label(s) found in each DataFrame, in the same order as df_ls
Q_ls = [np.unique(df['Quarter'].values) for df in df_ls]
# Position of the first post-COVID quarter
idx = Q_ls.index('2020 Q2')

# Everything before '2020 Q2' is pre-COVID, the rest is post-COVID
df_ls_pre, df_ls_post = df_ls[:idx], df_ls[idx:]

concatenated_df_pre = pd.concat(df_ls_pre)
concatenated_df_post = pd.concat(df_ls_post)

# One variation per feature column (first two columns and the last one are skipped)
variations = []
for col in df_ls[0].columns[2:-1]:
    median_pre = np.median(concatenated_df_pre[col].values)
    median_post = np.median(concatenated_df_post[col].values)
    variations.append(calc_variation(median_pre, median_post))

plot_variations(variations, custom_y_tick=True)





The analysis suggests that the feature that changes the most is the one in column 77 (X_77), since its value changes by more than 200%. There are also other variables with a variation of about 100%, like X_6, X_10 and others. Finally, there are variables with a less evident but still notable change, like X_17 and X_35, which have a variation of roughly 70%.

Now we perform the same analysis, but considering each sector independently.

In [None]:
def count_high_variations(count_dict,variations, threshold=75):
    """
    Update a running tally of how often each feature shows a high variation.

    A variation counts as high when it is greater than or equal to `threshold`.
    Feature names are looked up by position in the feature columns of the
    module-level `df_ls[0]`.

    Parameters:
    - count_dict (dict): Tally to update in place; keys are feature names, values are counts.
    - variations (list): List of variations for each feature.
    - threshold (float): The threshold for considering a variation as high. Default is 75.

    Returns:
    - count_dict (dict): The same dictionary that was passed in, after the update.
    """

    for position, value in enumerate(variations):
        # Skip anything that does not reach the threshold (NaN included)
        if not (value >= threshold):
            continue

        feature = df_ls[0].columns[2:-1][position] # Column name at this position
        count_dict[feature] = count_dict.get(feature, 0) + 1

    return count_dict

count_dict = {}
for sector in range(1, 7): # Repeat the pre/post comparison for sectors 1..6

    means_pre, means_post = [], []

    # Keep only the rows whose sector (column 83) matches the current one
    concatenated_df_pre_masked = concatenated_df_pre[concatenated_df_pre[83] == sector]
    concatenated_df_post_masked = concatenated_df_post[concatenated_df_post[83] == sector]

    variations = []
    for col in df_ls[0].columns[2:-1]:
        median_pre = np.median(concatenated_df_pre_masked[col].values)
        median_post = np.median(concatenated_df_post_masked[col].values)
        variations.append(calc_variation(median_pre, median_post))

    # Accumulate the per-feature tally across sectors, then plot this sector
    count_high_variations(count_dict, variations)
    plot_variations(variations, custom_y_tick=True, title=f"(sector{sector})")

# Features ordered by how many sectors exceeded the threshold, highest first
sorted_count_dict = dict(sorted(count_dict.items(), key=lambda pair: pair[1], reverse=True))

In [None]:
# Display the per-feature counts of high variations
sorted_count_dict

In order to rank the financial indicators based on their change, we count how many times they have a change of over 75%. As can be seen above, the features X_10, X_16, X_26, X_36 and X_68 have a significant change in all the sectors, while X_18, X_11, X_21, X_45 and X_59 change significantly in 5 out of 6 sectors.
Regarding the absolute values of the variations, X_51 has a large variation in Sector 1 and in Sector 4. In Sector 1, X_22 also changes by more than 100%. X_77 is characterized by a big change in Sector 2 and Sector 3. Finally, in Sector 4, X_29 and X_33 change by more than 200%.

## Sector Classifier: Training and Evaluation

In this section we train two sector classifiers. In particular, we consider XGBoost and RandomForest, since they are two kinds of models particularly suitable for tabular data analysis. XGBoost has been shown to achieve better performance, but we also consider RandomForest for its better interpretability.

We have to perform some preprocessing steps before the training. In particular, we have to drop the rows without a sector, because in those cases the label would be missing. Fortunately, there are only 8 of them. Then we have to transform the label values from [1,2,3,4,5,6] to [0,1,2,3,4,5]. Finally, we have to encode the variable indicating the country as a categorical one.

We obviously split the dataset into train and test, and we measure models accuracy on the test set.

For the tuning of the hyperparameters of the models, we used a random search.

In [None]:
# ignore_index gives every row a unique label. The per-file DataFrames each
# carry their own default index, so a plain concat repeats labels, and dropping
# by label would also remove rows that merely share a label with an unlabelled row.
df_tot = pd.concat(df_ls, ignore_index=True)

missing_label = df_tot[83] == 'm' # Rows whose label column (83) holds the missing marker
rows_dropped = int(missing_label.sum()) # Count number of rows with 'm' values of column with labels
df_tot = df_tot[~missing_label].copy() # Keep only labelled rows (boolean mask, not index labels)
print("dropped rows w/o label:",rows_dropped)

df_tot[83] = df_tot[83]-1 # Decrement the values in column 83 by 1
df_tot[83] = df_tot[83].astype(int) # Convert column 83 to integer type
label_encoder = LabelEncoder() # Initialize label encoder
df_tot[0] = label_encoder.fit_transform(df_tot[0]) # Encode values in column 0

df = df_tot.iloc[:,1:] # Everything except the first ('Quarter') column; the label stays last

### XGBoost

In [None]:
from sklearn.model_selection import train_test_split

def train_test_split_df(df, test_size=0.2, random_state=None):
    """
    Split a DataFrame into stratified train and test sets.

    The last column of `df` is used as the label; all other columns are
    the features. The split is stratified on the label.

    Parameters:
    - df (pandas.DataFrame): Features followed by the label column.
    - test_size (float): Value forwarded to train_test_split. Default is 0.2.
    - random_state: Value forwarded to train_test_split. Default is None.

    Returns:
    - X_train, X_test, y_train, y_test: The split features and labels.
    """
    features = df.iloc[:, :-1]
    labels = df.iloc[:, -1]

    split = train_test_split(
        features,
        labels,
        test_size=test_size,
        stratify=labels,
        random_state=random_state,
    )

    return tuple(split)


# Split with the defaults (test_size=0.2, random_state=None).
# NOTE(review): with random_state=None the split presumably draws from NumPy's global
# random state, so it is only repeatable if that state was seeded beforehand — confirm.
X_train, X_test, y_train, y_test = train_test_split_df(df)

In [None]:
from sklearn.model_selection import RandomizedSearchCV, train_test_split
import xgboost as xgb


# Candidate hyperparameter values sampled by the randomized search
param_grid = dict(
    n_estimators=[200, 300, 400, 500],
    max_depth=[10, 15, 20, 50],
    learning_rate=[0.005, 0.01, 0.05, 0.1],
)

# Base XGBoost classifier for the 6 classes
xgb_clf = xgb.XGBClassifier(objective='multi:softmax', num_class=6, random_state=42)

# 15 sampled configurations, each scored by accuracy with 3-fold cross-validation
random_search = RandomizedSearchCV(
    estimator=xgb_clf,
    param_distributions=param_grid,
    n_iter=15,
    scoring='accuracy',
    cv=3,
    verbose=3,
    random_state=42,
)

# Run the search on the training data only
random_search.fit(X_train, y_train)

# Keep the winning configuration
best_params = random_search.best_params_
print("Best Parameters:", best_params)



In [None]:
# Train the final model with the best parameters on the entire training set.
# A fresh classifier is built with the same fixed settings used for the search
# (objective, num_class, random_state) plus the selected hyperparameters.
final_model = xgb.XGBClassifier(objective='multi:softmax', num_class=6, random_state=42, **best_params)
final_model.fit(X_train, y_train)

In [None]:
# Evaluate the final model on the test set
y_pred = final_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
# Calculate recall, precision, and F1 score (macro-averaged over the classes)
recall = recall_score(y_test, y_pred, average='macro')
precision = precision_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')
cm = confusion_matrix(y_test, y_pred)

# Print metrics
print("Accuracy of XGB:", accuracy)
print("Recall of XGB:", recall)
print("Precision of XGB:", precision)
print("F1 Score of XGB:", f1)

# Plot confusion matrix.
# Rows and columns of the matrix follow the same class order, so both axes must
# use the same tick labels.
class_names = [f'Class {i}' for i in range(1, 7)]
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, cmap='Blues', fmt='g', xticklabels=class_names, yticklabels=class_names)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix of XGB')
plt.show()

The XGBoost model reaches an accuracy of almost 80%, which is quite a good result. Looking at the values of precision and recall and at the confusion matrix, we notice that precision is higher than recall. This means that while the predictions the model makes for a class are usually correct, it misses some of the instances that actually belong to that class. In other words, the model is conservative in its predictions: it prefers to assign a class less often, but the assignments it does make are more likely to be correct.

### RandomForest

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Candidate hyperparameter values sampled by the randomized search
param_grid = dict(
    n_estimators=[200, 300, 400, 500],
    max_depth=[10, 15, 20, 50],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Base Random Forest classifier with balanced class weights
rf_clf = RandomForestClassifier(class_weight='balanced',random_state=42)

# 15 sampled configurations, each scored by accuracy with 3-fold cross-validation
random_search_rf = RandomizedSearchCV(
    estimator=rf_clf,
    param_distributions=param_grid,
    n_iter=15,
    scoring='accuracy',
    cv=3,
    verbose=3,
    random_state=42,
)

# Run the search on the training data only
random_search_rf.fit(X_train, y_train)

# Keep the winning configuration
best_params_rf = random_search_rf.best_params_
print("Best Parameters for Random Forest:", best_params_rf)


In [None]:
# Initialize the Random Forest classifier with the best parameters.
# class_weight='balanced' is repeated here because it is a fixed setting of the
# estimator that was tuned, not one of the searched hyperparameters; without it
# the final model would differ from the configuration the search selected.
final_model = RandomForestClassifier(class_weight='balanced', random_state=42, **best_params_rf)

# Train the Random Forest classifier on the training data
final_model.fit(X_train, y_train)

In [None]:
# Evaluate the final model on the test set
y_pred = final_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
# Calculate recall, precision, and F1 score (macro-averaged over the classes)
recall = recall_score(y_test, y_pred, average='macro')
precision = precision_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')
cm = confusion_matrix(y_test, y_pred)

# Print metrics
print("Accuracy of Random Forest:", accuracy)
print("Recall of Random Forest:", recall)
print("Precision of Random Forest:", precision)
print("F1 Score of Random Forest:", f1)

# Plot confusion matrix.
# Rows and columns of the matrix follow the same class order, so both axes must
# use the same tick labels.
class_names = [f'Class {i}' for i in range(1, 7)]
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, cmap='Blues', fmt='g', xticklabels=class_names, yticklabels=class_names)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix of Random Forest')
plt.show()

In this case the accuracy is lower than that of XGBoost: 71%. We still observe a higher precision than recall, which leads to the same discussion we made for XGBoost.

From the analysis we have observed that XGBoost is the better model.

Finally, even though XGBoost is built to handle class imbalance and we made the Random Forest able to deal with it (class_weight='balanced'), it is clear that the fact that the majority of the dataset is made up of companies from Sector 3 strongly influences the classification. A more balanced dataset could probably lead to a performance improvement.