In [3]:
import pandas as pd

# Load the dataset
data = pd.read_csv('info.Allergen_Status_of_Food_Products.csv' , keep_default_na=False, na_values="")

# Display the first few rows of the dataset
print("Dataset Preview:")
print(data.head())

# Display dataset info
print("\nDataset Information:")
print(data.info())

# Check for missing values
print("\nMissing Values:")
print(data.isnull().sum())


Dataset Preview:
          Food Product Main Ingredient Sweetener Fat/Oil Seasoning  \
0       Almond Cookies         Almonds     Sugar  Butter     Flour   
1       Almond Cookies         Almonds     Sugar  Butter     Flour   
2  Chicken Noodle Soup   Chicken broth      None    None      Salt   
3  Chicken Noodle Soup   Chicken broth      None    None      Salt   
4       Cheddar Cheese          Cheese      None    None      Salt   

                Allergens  Price ($)  Customer rating (Out of 5) Prediction  
0   Almonds, Wheat, Dairy      10.15                         3.1   Contains  
1   Almonds, Wheat, Dairy       6.17                         4.5   Contains  
2  Chicken, Wheat, Celery      19.65                         4.1   Contains  
3  Chicken, Wheat, Celery      17.48                         4.7   Contains  
4                   Dairy      10.83                         3.7   Contains  

Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 399 entries, 0 to 398


In [19]:
# Check the percentage of missing values for each column
missing_percentage = data.isnull().mean() * 100
print("Percentage of Missing Values:\n", missing_percentage)

# Strategy: Drop columns with >50% missing values, fill others
threshold = 50  # Define a threshold for acceptable missing values
columns_to_drop = missing_percentage[missing_percentage > threshold].index
print("\nColumns to drop (more than 50% missing):", columns_to_drop)

# Drop columns with too many missing values
data.drop(columns=columns_to_drop, inplace=True)

# Fill missing values for numerical columns with mean
numerical_cols = data.select_dtypes(include=['float64', 'int64']).columns
data[numerical_cols] = data[numerical_cols].fillna(data[numerical_cols].mean())

# Fill missing values for categorical columns with mode
categorical_cols = data.select_dtypes(include=['object']).columns
data[categorical_cols] = data[categorical_cols].fillna(data[categorical_cols].mode().iloc[0])

# Verify no missing values remain
print("\nMissing values after handling:")
print(data.isnull().sum())


Percentage of Missing Values:
 Food Product                  0.0
Main Ingredient               0.0
Sweetener                     0.0
Fat/Oil                       0.0
Seasoning                     0.0
Allergens                     0.0
Price ($)                     0.0
Customer rating (Out of 5)    0.0
Prediction                    0.0
dtype: float64

Columns to drop (more than 50% missing): Index([], dtype='object')


IndexError: single positional indexer is out-of-bounds

In [7]:
from sklearn.preprocessing import LabelEncoder

# Identify categorical columns
categorical_cols = data.select_dtypes(include=['object']).columns
print("Categorical Columns:", categorical_cols)

# Apply Label Encoding to categorical columns
label_encoders = {}  # Store encoders for potential decoding later
for col in categorical_cols:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le  # Save the encoder

# Verify encoding
print("\nEncoded Dataset Preview:")
print(data.head(100))

# Check data types to ensure all columns are numerical
print("\nData Types After Encoding:")
print(data.dtypes)


Categorical Columns: Index([], dtype='object')

Encoded Dataset Preview:
    Food Product  Main Ingredient  Sweetener  Fat/Oil  Seasoning  Allergens  \
0              0                0          8        3         57          0   
1              0                0          8        3         57          0   
2             71               23          6       21        139          1   
3             71               23          6       21        139          1   
4             57               20          6       21        139          2   
..           ...              ...        ...      ...        ...        ...   
95           107               47          6       21        175          9   
96           101               63          6        3          9          9   
97           161               34          6        3        143         10   
98            96               34          5       21         22         10   
99            96               34          5       21     

In [10]:
# Check class distribution in the target column
target_column = 'Prediction'  # Replace with the actual target column name
print("Class Distribution in Target Column:")
print(data[target_column].value_counts())


Class Distribution in Target Column:
Prediction
0    256
1    143
Name: count, dtype: int64


In [11]:
# Calculate scale_pos_weight
scale_pos_weight = len(data[data['Prediction'] == 0]) / len(data[data['Prediction'] == 1])
print(f"Scale Pos Weight: {scale_pos_weight}")

# Pass this parameter when initializing XGBoost
import xgboost as xgb

xgb_model = xgb.XGBClassifier(scale_pos_weight=scale_pos_weight, random_state=42)


Scale Pos Weight: 1.7902097902097902


In [12]:
print("Data Types:")
print(data.dtypes)

# Ensure no object or string types remain in the dataset
assert data.select_dtypes(include=['object']).shape[1] == 0, "Categorical columns are still present!"


Data Types:
Food Product                    int32
Main Ingredient                 int32
Sweetener                       int32
Fat/Oil                         int32
Seasoning                       int32
Allergens                       int32
Price ($)                     float64
Customer rating (Out of 5)    float64
Prediction                      int32
dtype: object


In [13]:
from sklearn.preprocessing import StandardScaler

# Identify numerical columns
numerical_cols = data.select_dtypes(include=['int64', 'float64']).columns

# Apply scaling if necessary
scaler = StandardScaler()
data[numerical_cols] = scaler.fit_transform(data[numerical_cols])


In [14]:
# Detect outliers using IQR
for col in numerical_cols:
    Q1 = data[col].quantile(0.25)
    Q3 = data[col].quantile(0.75)
    IQR = Q3 - Q1
    outliers = data[(data[col] < Q1 - 1.5 * IQR) | (data[col] > Q3 + 1.5 * IQR)]
    print(f"Outliers detected in {col}: {len(outliers)}")


Outliers detected in Price ($): 0
Outliers detected in Customer rating (Out of 5): 0


In [15]:
correlation_matrix = data.corr()
print("Correlation Matrix:")
print(correlation_matrix)

# Drop features with correlation > 0.9
threshold = 0.9
correlated_features = set(
    correlation_matrix.columns[abs(correlation_matrix).gt(threshold).sum() > 1]
)
data = data.drop(columns=correlated_features)
print(f"Features dropped due to high correlation: {correlated_features}")


Correlation Matrix:
                            Food Product  Main Ingredient  Sweetener  \
Food Product                    1.000000         0.523192  -0.004210   
Main Ingredient                 0.523192         1.000000  -0.059914   
Sweetener                      -0.004210        -0.059914   1.000000   
Fat/Oil                         0.167671         0.106601  -0.210390   
Seasoning                       0.098401         0.005409  -0.122046   
Allergens                       0.036094         0.019861   0.179053   
Price ($)                      -0.010911        -0.074400  -0.031587   
Customer rating (Out of 5)     -0.085068         0.004258  -0.025218   
Prediction                      0.048272        -0.094763  -0.141715   

                             Fat/Oil  Seasoning  Allergens  Price ($)  \
Food Product                0.167671   0.098401   0.036094  -0.010911   
Main Ingredient             0.106601   0.005409   0.019861  -0.074400   
Sweetener                  -0.210390  -0

In [16]:
X = data.drop(columns=['Prediction'])  # Replace 'Prediction' with your actual target column
y = data['Prediction']

print("Features Shape:", X.shape)
print("Target Shape:", y.shape)


Features Shape: (399, 8)
Target Shape: (399,)


In [17]:
# Save the processed dataset as a CSV file
processed_file_path = 'processed_dataset.csv'
data.to_csv(processed_file_path, index=False)
print(f"Processed dataset saved to {processed_file_path}")


Processed dataset saved to processed_dataset.csv
