## Pre-processing of CSV files

In [None]:
import pandas as pd
import re
import string

def load_data(file_paths, encodings=('utf-8', 'ISO-8859-1')):
    """Read several CSV files and stack them into a single DataFrame.

    Each file is tried with every entry of ``encodings`` in order; the first
    encoding that does not raise ``UnicodeDecodeError`` wins.

    Args:
        file_paths: Iterable of paths (or anything ``pd.read_csv`` accepts).
        encodings: Candidate encodings, tried in order for each file.

    Returns:
        The row-wise concatenation of all successfully read files, with a
        fresh 0..n-1 index.

    Raises:
        ValueError: If no file could be read at all.
    """
    dataframes = []
    for file_path in file_paths:
        for encoding in encodings:
            # Keep the try body minimal: only the read can raise a decode error.
            try:
                df = pd.read_csv(file_path, encoding=encoding)
            except UnicodeDecodeError:
                continue  # Try the next encoding if there's an error
            dataframes.append(df)
            print(f"Successfully loaded {file_path} with encoding {encoding}. Shape: {df.shape}")
            break  # Exit the loop if reading is successful
        else:
            # No encoding worked: report it instead of silently dropping the file.
            print(f"Warning: could not decode {file_path} with any of {list(encodings)}; skipping.")

    if not dataframes:
        # pd.concat([]) would raise an opaque "No objects to concatenate".
        raise ValueError("No files could be loaded; nothing to concatenate.")

    combined_df = pd.concat(dataframes, ignore_index=True)
    print(f"Combined DataFrame shape: {combined_df.shape}")
    return combined_df

# Patterns are compiled once because clean_text is applied to every row.
_WHITESPACE_RE = re.compile(r'\s+')
_PUNCTUATION_RE = re.compile(f'[{re.escape(string.punctuation)}]')
_NON_ASCII_RE = re.compile(r'[^\x00-\x7F]+')


def clean_text(text):
    """Normalise a free-text value for downstream processing.

    The value is lower-cased, stripped of ASCII punctuation and non-ASCII
    characters, and has every run of whitespace collapsed to one space.

    Args:
        text: The raw value. Null values (``None``/``NaN``) yield ``''``;
            other non-string values are converted with ``str()`` first.

    Returns:
        The cleaned string, without leading or trailing whitespace.
    """
    if pd.isnull(text):
        return ''
    if not isinstance(text, str):
        text = str(text)  # e.g. numeric cells; avoids AttributeError on .lower()
    text = text.lower()  # Convert to lowercase
    # Normalise whitespace first so Unicode spaces (e.g. NBSP) become a plain
    # space rather than being deleted by the non-ASCII filter below.
    text = _WHITESPACE_RE.sub(' ', text)
    text = _PUNCTUATION_RE.sub('', text)  # Remove punctuation
    text = _NON_ASCII_RE.sub('', text)  # Remove non-ASCII characters
    # Removing characters can leave adjacent spaces ("a , b" -> "a  b"),
    # so collapse whitespace again after the removals.
    text = _WHITESPACE_RE.sub(' ', text)
    return text.strip()

def preprocess_dataframe(df):
    """Drop incomplete rows and clean the two free-text columns.

    Note that ``df`` is modified in place: rows containing a NaN in any
    column are removed from the caller's object, and the 'concerns' and
    'anything else' columns are overwritten with their cleaned versions.
    The same (mutated) DataFrame is returned for convenience.
    """
    shape_before = df.shape
    print(f"DataFrame shape before dropping NaNs: {shape_before}")

    # Remove every row that has at least one missing value.
    df.dropna(inplace=True)
    shape_after = df.shape
    print(f"DataFrame shape after dropping NaNs: {shape_after}")

    # Apply the same text normalisation to both free-text columns.
    for text_column in ('concerns', 'anything else'):
        df[text_column] = df[text_column].apply(clean_text)

    return df

# Load the raw survey exports and clean their free-text columns.
file_paths = ['data_file_1_02.csv', 'data_file_2_08.csv', 'data_file_3_08.csv']
df = load_data(file_paths)
df = preprocess_dataframe(df)

# Encode labels
# Series.map with a dict turns any label missing from the mapping into NaN.
label_mapping = {'AC': 0, 'PC': 1, 'TC': 2, 'NC': 3}
df['concerns category'] = df['concerns category'].map(label_mapping)
df['anything else category'] = df['anything else category'].map(label_mapping)

# Create combined text and category columns
# Both (text, category) pairs are renamed to a shared schema so they can be stacked.
concerns_df = df[['concerns', 'concerns category']].rename(columns={'concerns': 'combined_text', 'concerns category': 'combined_category'})
anything_else_df = df[['anything else', 'anything else category']].rename(columns={'anything else': 'combined_text', 'anything else category': 'combined_category'})

# Concatenate both dataframes
final_df = pd.concat([concerns_df, anything_else_df], ignore_index=True)

# Save the final cleaned dataset
final_df.to_csv('cleaned_data.csv', index=False)

# Report the shape of the frame that was actually written, not the
# intermediate `df` it was built from.
print(f"Final DataFrame shape: {final_df.shape}")

Successfully loaded data_file_1_02.csv with encoding ISO-8859-1. Shape: (1279, 4)
Successfully loaded data_file_2_08.csv with encoding ISO-8859-1. Shape: (718, 4)
Successfully loaded data_file_3_08.csv with encoding ISO-8859-1. Shape: (621, 4)
Combined DataFrame shape: (2618, 4)
DataFrame shape before dropping NaNs: (2618, 4)
DataFrame shape after dropping NaNs: (1324, 4)
Final DataFrame shape: (1324, 4)
