# Классификация фруктов (Logistic regression, SVM, Random Forest)


## Импорты


In [9]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler

## Генерация данных


In [10]:
np.random.seed(42)
n_samples = 1000

# Generation parameters for every fruit, listed in the order used for sampling:
#   (weight mean in g, weight std), (size mean in cm, size std), colors, shapes
# A plain string in the "colors" slot is a fixed value: it is appended as-is
# and no random draw is made for it.
fruit_specs = {
    "apple": ((150, 20), (7, 1), ["red", "green", "yellow"], ["round", "oval"]),
    "orange": ((200, 30), (8, 1.5), ["orange", "yellow"], ["round", "oval"]),
    "banana": ((120, 15), (10, 2), ["yellow", "green"], ["elongated"]),
    "grape": ((5, 1), (1, 0.5), ["purple", "green", "red"], ["round"]),
    "kiwi": ((75, 10), (5, 0.5), "brown", ["oval"]),
    "papaya": ((1000, 100), (30, 5), ["yellow", "brown"], ["oval", "elongated"]),
    "blueberry": ((2, 0.5), (1, 0.5), ["blue", "purple"], ["round"]),
    "watermelon": ((5000, 500), (50, 5), ["green"], ["round", "oval"]),
}

# Class labels, in the insertion order of the spec table above.
fruits = list(fruit_specs)

data = {"weight": [], "size": [], "color": [], "shape": [], "fruit": []}

for _ in range(n_samples):
    fruit = np.random.choice(fruits)
    weight_params, size_params, colors, shapes = fruit_specs[fruit]

    # The draws happen in a fixed order (weight, size, color, shape) so that
    # the seeded random stream is consumed the same way on every run.
    weight = np.random.normal(*weight_params)
    size = np.random.normal(*size_params)
    color = colors if isinstance(colors, str) else np.random.choice(colors)
    shape = np.random.choice(shapes)

    data["fruit"].append(fruit)
    data["weight"].append(weight)
    data["size"].append(size)
    data["color"].append(color)
    data["shape"].append(shape)

# Assemble the samples into a table.
df = pd.DataFrame(data)

# Normal draws can fall below 1 (or go negative); floor both numeric columns at 1.
for numeric_column in ("weight", "size"):
    df[numeric_column] = df[numeric_column].clip(lower=1)

# Shuffle the rows reproducibly and renumber the index.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Show the first rows.
print("Dataset Preview:")
display(df.head(10))

# Descriptive statistics of the numeric columns.
print("\nDataset Summary:")
display(df.describe())

# Persist the dataset for the following cells.
df.to_csv("fruit_data.csv", index=False)
print("Dataset saved as 'fruit_data.csv'")

Dataset Preview:


Unnamed: 0,weight,size,color,shape,fruit
0,138.962837,9.558199,green,round,apple
1,5.173342,1.121977,purple,round,grape
2,4.304956,1.0,green,round,grape
3,1.990848,1.198622,blue,round,blueberry
4,4.283134,1.77934,purple,round,grape
5,5.521122,1.322608,red,round,grape
6,5519.119796,50.431838,green,round,watermelon
7,136.498574,8.283789,green,elongated,banana
8,874.349311,34.066026,brown,oval,papaya
9,176.052943,5.993233,yellow,oval,orange



Dataset Summary:


Unnamed: 0,weight,size
count,1000.0,1000.0
mean,857.854348,14.531007
std,1655.706558,16.722983
min,1.0,1.0
25%,60.089268,3.970126
50%,129.408174,7.151529
75%,899.443294,23.453937
max,6747.938895,65.309735


Dataset saved as 'fruit_data.csv'


## Предварительная обработка данных (Preprocessing)

In [11]:
# Reload the dataset that the generation cell saved to disk.
df = pd.read_csv("fruit_data.csv")

# One encoder per categorical column; each stays available afterwards so the
# integer codes can be mapped back to the original labels.
le_color, le_shape, le_fruit = LabelEncoder(), LabelEncoder(), LabelEncoder()
for column, encoder in (
    ("color", le_color),
    ("shape", le_shape),
    ("fruit", le_fruit),
):
    # Replace the string labels in this column with integer codes.
    df[column] = encoder.fit_transform(df[column])

print("Encoded Dataset Preview:")
display(df.head(10))

# Feature matrix (X) and target vector (y).
X = df.loc[:, ["weight", "size", "color", "shape"]]
y = df.loc[:, "fruit"]

# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

Encoded Dataset Preview:


Unnamed: 0,weight,size,color,shape,fruit
0,138.962837,9.558199,2,2,0
1,5.173342,1.121977,4,2,3
2,4.304956,1.0,2,2,3
3,1.990848,1.198622,0,2,2
4,4.283134,1.77934,4,2,3
5,5.521122,1.322608,5,2,3
6,5519.119796,50.431838,2,2,7
7,136.498574,8.283789,2,0,1
8,874.349311,34.066026,1,1,6
9,176.052943,5.993233,6,1,5


## Создание и тренировка моделей.
Предварительные результаты без оптимизации гиперпараметров и без кросс-валидации

In [12]:
# Fit the scaler on the training split only, then apply it to both splits.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


def _fit_and_report(model, header):
    """Fit *model* on the scaled training split and print its test report.

    Prints *header* followed by the classification report for the scaled
    test split, labelled with the original fruit names, and returns the
    predicted test labels.
    """
    model.fit(X_train_scaled, y_train)
    predictions = model.predict(X_test_scaled)
    print(header)
    print(
        classification_report(
            y_test, predictions, target_names=le_fruit.classes_
        )
    )
    return predictions


# Model 1: Logistic Regression
lr = LogisticRegression(max_iter=1000, random_state=42)
lr_pred = _fit_and_report(lr, "\nLogistic Regression Results:")

# Model 2: Support Vector Machine (RBF kernel for non-linear separation)
svm = SVC(kernel="rbf", random_state=42)
svm_pred = _fit_and_report(svm, "\nSVM Results:")

# Model 3: Random Forest (trained on the same scaled features as the others)
rf = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
rf_pred = _fit_and_report(rf, "\nRandom Forest Results:")


Logistic Regression Results:
              precision    recall  f1-score   support

       apple       0.67      0.08      0.14        25
      banana       1.00      1.00      1.00        23
   blueberry       0.43      0.68      0.53        19
       grape       0.79      0.67      0.72        33
        kiwi       0.87      1.00      0.93        33
      orange       0.52      0.93      0.67        14
      papaya       1.00      1.00      1.00        34
  watermelon       1.00      1.00      1.00        19

    accuracy                           0.80       200
   macro avg       0.78      0.79      0.75       200
weighted avg       0.81      0.80      0.77       200


SVM Results:
              precision    recall  f1-score   support

       apple       0.75      0.72      0.73        25
      banana       1.00      1.00      1.00        23
   blueberry       1.00      0.68      0.81        19
       grape       0.85      1.00      0.92        33
        kiwi       1.00      1.00 