# Gmo Food Classification

In [None]:
# import the libraries we will be using
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [None]:
# Read the five per-group CSV files and stack them into a single DataFrame, "df".

csv_paths = (
    'FOOD-DATA-GROUP1.csv',
    'FOOD-DATA-GROUP2.csv',
    'FOOD-DATA-GROUP3.csv',
    'FOOD-DATA-GROUP4.csv',
    'FOOD-DATA-GROUP5.csv',
)
df1, df2, df3, df4, df5 = (pd.read_csv(path) for path in csv_paths)

# ignore_index=True gives the combined frame a fresh 0..n-1 index instead of
# repeating each file's own row numbers.
df = pd.concat([df1, df2, df3, df4, df5], ignore_index=True)
df

In [None]:
# Drop the first two columns (not rows): they are useless and redundant.
# NOTE: the slice is positional, so re-running this cell removes two more columns each time.
df = df.iloc[:, 2:]
df

In [None]:
# Count the missing values in each column; every count should be 0.
df.isna().sum()

In [None]:
# Split the foods into four equally sized quartiles of 'Nutrition Density' and
# grade them A, B, C, D.
# qcut assigns labels starting from the lowest-scoring bin, so the labels are
# listed worst-to-best (D first) to give the highest scores an 'A'.
grade_labels = ['D', 'C', 'B', 'A']
df['Grade'] = pd.qcut(
    df['Nutrition Density'],
    q=len(grade_labels),
    labels=grade_labels,
)
# We don't want to set the thresholds ourselves because the scale is not meant to
# control the public, only to give a quicker and easier way to compare a given food
# with all the other foods on the market.
df.head(20)

In [None]:
# Ensure an even distribution: each grade should hold roughly a quarter of the rows.
df['Grade'].value_counts()

In [None]:
# Build the feature matrix and the target.
# 'food' is only the item's name. 'Nutrition Density' must be excluded as well:
# 'Grade' is binned directly from it, so leaving it in X leaks the target into
# the features and makes the evaluation meaningless.
X = df.drop(columns=['food', 'Nutrition Density', 'Grade'])
y = df['Grade']

# Split into training and test sets.
# stratify=y keeps the four grades balanced in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Pipeline: standardize the features, then fit a logistic regression.
# The deprecated multi_class argument is omitted; with the lbfgs solver and more
# than two classes scikit-learn already fits a multinomial model.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('log_reg', LogisticRegression(max_iter=1000, solver='lbfgs')),
])

# Fit the model
pipeline.fit(X_train, y_train)

# Predict on the held-out test set
y_pred = pipeline.predict(X_test)

# Evaluate: confusion-matrix rows/columns follow the sorted labels (A, B, C, D).
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred, output_dict=False)

# Print rather than display the tuple, so the report's line breaks are rendered.
print(conf_matrix)
print(class_report)


In [None]:
# Correlate every numeric column with every other, then plot only the
# 'Nutrition Density' column, sorted from strongest positive to strongest negative.
correlation = df.select_dtypes(include='number').corr()
density_correlation = correlation[['Nutrition Density']].sort_values(
    by='Nutrition Density',
    ascending=False,
)

plt.figure(figsize=(14, 10))
sns.heatmap(density_correlation, annot=True, cmap='coolwarm')
plt.title('Correlation with Nutrition Density')
plt.show()

###
###

In [None]:
# Make a copy of the full DataFrame so that later edits to df_R leave df untouched.
df_R = df.copy()
df_R

In [None]:
# Drop the columns that are not needed: 'food' is removed from df_R in place.
# NOTE: re-running this cell raises KeyError because 'food' is already gone.
df_R.drop(columns=['food'], inplace = True)

In [None]:
df_R.shape
# Subtract 2 from the column count because two of the columns are target variables
# (presumably 'Nutrition Density' and 'Grade' -- confirm), not nutritional indicators.
# The minimum required number of known nutritional indicators has to be less than 33,
# because 33 is everything needed to compute the nutrition density score, and such a
# food would be easily classifiable on the scale.
# We must also choose a minimum number that makes sense, but it is not yet clear how to
# find that number, so it is set to 10.

In [None]:
# Randomly choose 23 nutritional-indicator columns to hide; with 33 indicator
# columns this leaves the chosen minimum of 10 known indicators.
# The target columns are excluded from the draw so they can never be dropped,
# and the generator is seeded so the selection is reproducible between runs.
target_columns = ['Nutrition Density', 'Grade']
indicator_columns = [col for col in df_R.columns if col not in target_columns]
rng = np.random.default_rng(42)
columns_to_drop = rng.choice(indicator_columns, size=23, replace=False)

In [None]:
# New frame without the randomly selected columns; df_R itself is left unchanged.
df_random_drop = df_R.drop(columns = columns_to_drop)