In [2]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, RobustScaler, Normalizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.pipeline import Pipeline
import warnings
warnings.filterwarnings('ignore')

### Load data

In [3]:
# Read the train/test CSV files and discard the 'id' and 'Row#' columns.
train = pd.read_csv('train.csv').drop(columns=['id', 'Row#'])
test = pd.read_csv('test.csv').drop(columns=['id', 'Row#'])


### Rename the features to more readable names

In [4]:
# Raw dataset column name -> human-readable label.
_READABLE_COLUMN_NAMES = {
    'clonesize': 'Clone Size',
    'honeybee': 'Honeybee Activity',
    'bumbles': 'Bumblebee Activity',
    'andrena': 'Andrena Bee Activity',
    'osmia': 'Osmia Bee Activity',
    'MaxOfUpperTRange': 'Max Upper Temperature',
    'MinOfUpperTRange': 'Min Upper Temperature',
    'AverageOfUpperTRange': 'Avg Upper Temperature',
    'MaxOfLowerTRange': 'Max Lower Temperature',
    'MinOfLowerTRange': 'Min Lower Temperature',
    'AverageOfLowerTRange': 'Avg Lower Temperature',
    'RainingDays': 'Rainy Days',
    'AverageRainingDays': 'Avg Rainy Days',
    'fruitset': 'Fruit Set',
    'fruitmass': 'Fruit Mass',
    'seeds': 'Seed Count',
    'yield': 'Yield',
}


def change_columns(data):
    """Return a copy of ``data`` with the raw column names replaced by readable labels.

    Columns that are not listed in the mapping keep their original name, and
    mapping entries with no matching column are ignored. The input frame is
    not modified.
    """
    return data.rename(columns=_READABLE_COLUMN_NAMES)

# Give both frames the readable column names, then preview the training data.
train = change_columns(train)
test = change_columns(test)
train.head()


Unnamed: 0,Clone Size,Honeybee Activity,Bumblebee Activity,Andrena Bee Activity,Osmia Bee Activity,Max Upper Temperature,Min Upper Temperature,Avg Upper Temperature,Max Lower Temperature,Min Lower Temperature,Avg Lower Temperature,Rainy Days,Avg Rainy Days,Fruit Set,Fruit Mass,Seed Count,Yield
0,12.5,0.25,0.25,0.25,0.75,69.7,42.1,58.2,50.2,24.3,41.2,16.0,0.26,0.477941,0.423927,34.043022,6079.08526
1,25.0,0.0,0.0,0.0,0.0,86.0,52.0,71.9,62.0,30.0,50.8,3.77,0.06,0.334594,0.354415,27.735098,2946.92602
2,25.0,0.5,0.25,0.75,0.63,86.0,52.0,71.9,62.0,30.0,50.8,34.0,0.56,0.468192,0.417915,34.838815,5323.30034
3,25.0,0.5,0.25,0.38,0.75,69.7,42.1,58.2,50.2,24.3,41.2,24.0,0.39,0.500558,0.427431,35.134955,6157.05484
4,12.5,0.25,0.38,0.5,0.75,77.4,46.8,64.7,55.8,27.0,45.8,24.0,0.39,0.520181,0.464067,37.02918,6992.82314


In [5]:
def FE(df):
    """Return a copy of ``df`` with four engineered feature columns added.

    The input frame must already use the readable column names. The added
    columns are:

    - ``Total Bee Density``: sum of the four bee activity columns.
    - ``Bee to Clone Ratio``: ``Total Bee Density`` divided by ``Clone Size``.
    - ``Max Temp Difference``: ``Max Upper Temperature`` minus
      ``Min Lower Temperature``.
    - ``Mass Set``: ``Fruit Mass`` multiplied by ``Fruit Set``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the source columns listed above.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df`` itself is left untouched so that re-running the
        cell or reusing the original frame never sees half-applied features.

    Raises
    ------
    KeyError
        If any of the required source columns is missing.
    """
    # Work on a copy so the caller's frame is not mutated as a side effect.
    out = df.copy()

    out["Total Bee Density"] = (
        out['Honeybee Activity']
        + out['Bumblebee Activity']
        + out['Andrena Bee Activity']
        + out['Osmia Bee Activity']
    )
    # NOTE: a Clone Size of 0 yields inf/NaN here, exactly as plain pandas division does.
    out["Bee to Clone Ratio"] = out["Total Bee Density"] / out["Clone Size"]
    out["Max Temp Difference"] = out["Max Upper Temperature"] - out["Min Lower Temperature"]
    out["Mass Set"] = out["Fruit Mass"] * out["Fruit Set"]

    return out

# Add the engineered feature columns to both the training and the test frame.
train, test = FE(train), FE(test)

#### Model

In [6]:
# Target is 'Yield'; features are every other numeric column.
y = train['Yield']
X = train.select_dtypes('number').drop(columns=['Yield'])

# Hold out 30% of the rows, using a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [7]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Define the RandomForestRegressor model.
# NOTE: the scikit-learn docs state that criterion='absolute_error' trains
# significantly more slowly than the default 'squared_error'; expect this
# cell to take a while with 300 trees.
rf_model = RandomForestRegressor(
    max_depth=9,
    random_state=42,
    criterion='absolute_error',
    n_estimators=300,
    n_jobs=-1
)

# Fit on the training split only. Fitting on the full (X, y) would let the
# model see the X_test rows, making the metrics below optimistically biased.
rf_model.fit(X_train, y_train)

# Predict on the held-out split, which the model has not seen.
y_pred = rf_model.predict(X_test)

# Hold-out metrics: MAE, RMSE and R2.
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f'Mean Absolute Error: {mae}')
print(f'Root Mean Squared Error: {rmse}')
print(f'R2 Score: {r2}')


KeyboardInterrupt: 

In [107]:
# Predict on the competition test set. Select the feature columns by name from
# X so the columns (and their order) match what the model was fitted on; a
# missing feature then fails with a clear KeyError instead of a shape or
# feature-name mismatch inside scikit-learn.
ty_pred = rf_model.predict(test[X.columns])

# Fill the sample submission's 'yield' column with the predictions and save it.
sub = pd.read_csv('sample_submission.csv')
sub['yield'] = ty_pred
sub.to_csv('Exam.csv', index=False)