In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
import numpy as np

In [2]:
df = pd.read_csv('../머신러닝/comproducts.csv')

In [3]:
df = df.drop(['key', 'name', 'subhead', 'description'], axis=1)


In [4]:
df = df.drop(df.query('rating_count < 10').index, axis=0).reset_index(drop=True)
df
#5개 인덱스 날림

Unnamed: 0,brand,rating,rating_count,ingredients
0,bj,3.7,208,"CREAM, SKIM MILK, LIQUID SUGAR (SUGAR, WATER),..."
1,bj,4.0,127,"CREAM, SKIM MILK, LIQUID SUGAR (SUGAR, WATER),..."
2,bj,4.7,130,"CREAM, LIQUID SUGAR (SUGAR, WATER), SKIM MILK,..."
3,bj,3.6,70,"CREAM, SKIM MILK, LIQUID SUGAR (SUGAR, WATER),..."
4,bj,4.5,281,"CREAM, SKIM MILK, WATER, LIQUID SUGAR (SUGAR, ..."
...,...,...,...,...
231,breyers,4.0,28,"MILK, CORN SYRUP, SUGAR, BROWN SUGAR, SOYBEAN ..."
232,breyers,4.7,18,"MILK, WATER, CARAMEL SWIRL, SUGAR, WATER, CORN..."
233,breyers,2.5,31,"MILK, CORN SYRUP, SUGAR, WHEAT FLOUR, BUTTER, ..."
234,breyers,3.2,38,"MILK, CORN SYRUP, ENRICHED WHEAT FLOUR, WHEAT ..."


In [5]:
#전처리 끝났으면 컬럼 날려줌
df = df.drop('rating_count', axis=1)


In [6]:
# list로 반환
import re
from nltk.stem import PorterStemmer
def process_ingredients(ingredients):
    ps = PorterStemmer()
    new_ingredients = re.sub(r'\(.*?\)', '', ingredients)
    new_ingredients = re.sub(r'CONTAINS:.*$', '', new_ingredients)
    new_ingredients = re.sub(r'contains:.*$', '', new_ingredients)
    new_ingredients = re.sub(r'\..*?:', ',', new_ingredients)
    new_ingredients = re.sub(r'( AND/OR )', ',', new_ingredients)
    new_ingredients = re.sub(r'( AND )', ',', new_ingredients)
    new_ingredients = new_ingredients.split(',')
    for i in range(len(new_ingredients)):
        new_ingredients[i] = new_ingredients[i].replace('†', '').replace('*', ' ').replace(')', '').replace('/', ' ')
        new_ingredients[i] = re.sub(r'^.+:', '', new_ingredients[i])
        new_ingredients[i] = ps.stem(new_ingredients[i].strip())
        if new_ingredients[i] == 'milk fat':
            new_ingredients[i] = 'milkfat'
    return new_ingredients

In [7]:
# 재료들 컬럼화
all_ingredients = set()

for row in df.iterrows():
    ingredients = process_ingredients(df.loc[row[0], 'ingredients'])
    for ingredient in ingredients:
        if ingredient not in all_ingredients:
            all_ingredients.add(ingredient)

all_ingredients.remove('')

In [9]:
y = df.loc[:, 'rating']
X = df.drop('rating', axis=1)
X

Unnamed: 0,brand,ingredients
0,bj,"CREAM, SKIM MILK, LIQUID SUGAR (SUGAR, WATER),..."
1,bj,"CREAM, SKIM MILK, LIQUID SUGAR (SUGAR, WATER),..."
2,bj,"CREAM, LIQUID SUGAR (SUGAR, WATER), SKIM MILK,..."
3,bj,"CREAM, SKIM MILK, LIQUID SUGAR (SUGAR, WATER),..."
4,bj,"CREAM, SKIM MILK, WATER, LIQUID SUGAR (SUGAR, ..."
...,...,...
231,breyers,"MILK, CORN SYRUP, SUGAR, BROWN SUGAR, SOYBEAN ..."
232,breyers,"MILK, WATER, CARAMEL SWIRL, SUGAR, WATER, CORN..."
233,breyers,"MILK, CORN SYRUP, SUGAR, WHEAT FLOUR, BUTTER, ..."
234,breyers,"MILK, CORN SYRUP, ENRICHED WHEAT FLOUR, WHEAT ..."


In [10]:
X['ingredients'] = X['ingredients'].apply(process_ingredients)

In [11]:
X['ingredients']

0      [cream, skim milk, liquid sugar, water, brown ...
1      [cream, skim milk, liquid sugar, water, sugar,...
2      [cream, liquid sugar, skim milk, water, sugar,...
3      [cream, skim milk, liquid sugar, water, corn s...
4      [cream, skim milk, water, liquid sugar, sugar,...
                             ...                        
231    [milk, corn syrup, sugar, brown sugar, soybean...
232    [milk, water, caramel swirl, sugar, water, cor...
233    [milk, corn syrup, sugar, wheat flour, butter,...
234    [milk, corn syrup, enriched wheat flour, wheat...
235    [milk, corn syrup, sugar, enriched wheat flour...
Name: ingredients, Length: 236, dtype: object

In [12]:
# list comprehension으로 원핫 인코딩
def one_hot_encode(ingredients, all_ingredients):
    encoding = pd.Series([1 if ingredient in ingredients else 0 for ingredient in all_ingredients], index=all_ingredients)
    return encoding

In [13]:
#함수 불러옴
encoded_ingredients = X['ingredients'].apply(lambda x: one_hot_encode(x, all_ingredients))

In [14]:
#Series 형태를 input할수 있게 df로
encoded_df = pd.DataFrame(encoded_ingredients, index=df.index)

In [15]:
X = encoded_df
X

Unnamed: 0,sugar,roasted peanut,strawberri,cherry juice concentr,pgpr,nonfat milk,coconut extract,water,organic caramelized sugar,natural flavor,...,powdered sugar,organic milk,fudge co,ground heath toffee piec,wheat bran,yellow 5,maltitol,soybean lecithin,reese's peanut butter cup,sodium acid pyrophosph
0,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
231,1,0,0,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,1,0,0
232,1,0,0,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
233,1,0,0,0,0,1,0,1,0,1,...,0,0,0,0,0,0,0,1,0,0
234,1,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [18]:
models = {
    'Ridge Regression': Ridge(random_state=42),
    'Lasso Regression': Lasso(random_state=42),
    'ElasticNet Regression': ElasticNet(random_state=42),
    'Decision Tree Regressor': DecisionTreeRegressor(random_state=42),
    'Random Forest Regressor': RandomForestRegressor(random_state=42),
    'Gradient Boosting Regressor': GradientBoostingRegressor(random_state=42),
    'AdaBoost Regressor': AdaBoostRegressor(random_state=42),
    'Support Vector Regressor': SVR(),
    'K-Neighbors Regressor': KNeighborsRegressor()
}

In [19]:
def evaluate_models(X, y, models, val=5):
    cv_results = {}
    for name, model in models.items():
        cv_scores = cross_val_score(model, X, y, cv=val, scoring='neg_mean_absolute_error')
        cv_results[name] = {
            'mean_score': -np.mean(cv_scores),
            'std_score': np.std(cv_scores)
        }
        cv_results[name]["mean_score"] = cv_results[name]["mean_score"]/y.mean()
        print(f'{name} - Cross-validation mean score (MAE): {cv_results[name]["mean_score"]:.4f}, std: {cv_results[name]["std_score"]:.4f}')
    return cv_results

In [20]:
print("Evaluating models without scaling:")
cv_results_no_scaling = evaluate_models(X, y, models)

Evaluating models without scaling:
Ridge Regression - Cross-validation mean score (MAE): 0.1414, std: 0.1313
Lasso Regression - Cross-validation mean score (MAE): 0.1116, std: 0.1121
ElasticNet Regression - Cross-validation mean score (MAE): 0.1116, std: 0.1121
Decision Tree Regressor - Cross-validation mean score (MAE): 0.1379, std: 0.1123
Random Forest Regressor - Cross-validation mean score (MAE): 0.1122, std: 0.0845
Gradient Boosting Regressor - Cross-validation mean score (MAE): 0.1155, std: 0.0953
AdaBoost Regressor - Cross-validation mean score (MAE): 0.1212, std: 0.0776
Support Vector Regressor - Cross-validation mean score (MAE): 0.1100, std: 0.1174
K-Neighbors Regressor - Cross-validation mean score (MAE): 0.1240, std: 0.1175


In [23]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [22]:
print("\nEvaluating models with scaling:")
cv_results_with_scaling = evaluate_models(X_scaled, y, models)


Evaluating models with scaling:
Ridge Regression - Cross-validation mean score (MAE): 0.2451, std: 0.2045
Lasso Regression - Cross-validation mean score (MAE): 0.1116, std: 0.1121
ElasticNet Regression - Cross-validation mean score (MAE): 0.1116, std: 0.1121
Decision Tree Regressor - Cross-validation mean score (MAE): 0.1379, std: 0.1123
Random Forest Regressor - Cross-validation mean score (MAE): 0.1122, std: 0.0845
Gradient Boosting Regressor - Cross-validation mean score (MAE): 0.1155, std: 0.0953
AdaBoost Regressor - Cross-validation mean score (MAE): 0.1212, std: 0.0776
Support Vector Regressor - Cross-validation mean score (MAE): 0.1083, std: 0.1029
K-Neighbors Regressor - Cross-validation mean score (MAE): 0.1128, std: 0.1186


In [25]:
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)

In [36]:
gbr = GradientBoostingRegressor(random_state=42, max_depth=10, min_samples_leaf=1, min_samples_split=10, n_estimators=300)
gbr.fit(X_train_scaled, y_train)

In [37]:
def predict_rating(ingredients_list):
    processed_ingredients = process_ingredients(', '.join(ingredients_list))
    input_df = pd.DataFrame(columns=X.columns)
    input_df.loc[0] = 0  # Initialize with zeros
    for ingredient in processed_ingredients:
        if ingredient in input_df.columns:
            input_df.loc[0, ingredient] = 1

    # Scale the input data
    input_scaled = scaler.transform(input_df)

    # Predict the rating
    predicted_rating = gbr.predict(input_scaled)
    return predicted_rating[0]

In [38]:
input_ingredients = ['CREAM', 'SKIM MILK', 'WATER', 'LIQUID SUGAR', 'SUGAR', 'SOYBEAN OIL', 'EGG YOLKS', 'WHEAT FLOUR', 'COCONUT OIL', 'CHOCOLATE CHIPS', 'COCOA BUTTER', 'SOY LECITHIN', 'BUTTER', 'COCOA', 'CORN SYRUP', 'NATURAL FLAVOR', 'COCOA POWDER', 'EGGS', 'MOLASSES', 'SALT', 'GUAR GUM', 'BROWN SUGAR', 'CHOCOLATE LIQUOR', 'VANILLA EXTRACT', 'SOY LECITHIN', 'CARRAGEENAN', 'BAKING SODA']
predicted_rating = predict_rating(input_ingredients)
print(f'Predicted rating for ingredients : {predicted_rating:.4f}')

Predicted rating for ingredients : 4.5355
