In [1]:
from zipfile import ZipFile
import os
import re
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import AdaBoostRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

In [15]:
def leaderboard():
    with open('leaderboard.txt') as f:
        data = f.read()
    data = re.findall('[\d]+\.([\d]+.[\d]+)', data)
    return [float(i[5:]) for i in data]

In [3]:
if not os.path.exists('dataset'):
    with ZipFile('Participants_Data_GGSH_Solution_Notebook.zip') as f:
        f.extractall('dataset')

In [4]:
train = pd.read_csv('dataset/India_train.csv')
test = pd.read_csv('dataset/India_test.csv')
submission = pd.read_csv('dataset/submission.csv')

In [5]:
print('Train columns: ')
for i,j in enumerate(train.columns):
    print(f"{i+1:>3}. {j} | {train[j].dtype}")

Train columns: 
  1. Year | int64
  2. State Name | object
  3. Dist Name | object
  4. COTTON AREA (1000 ha) | float64
  5. COTTON PRODUCTION (1000 tons) | float64
  6. COTTON YIELD (Kg per ha) | int64
  7. JANUARY PERCIPITATION (Millimeters) | float64
  8. FEBRUARY PERCIPITATION (Millimeters) | float64
  9. MARCH PERCIPITATION (Millimeters) | float64
 10. APRIL PERCIPITATION (Millimeters) | float64
 11. MAY PERCIPITATION (Millimeters) | float64
 12. JUNE PERCIPITATION (Millimeters) | float64
 13. JULY PERCIPITATION (Millimeters) | float64
 14. AUGUST PERCIPITATION (Millimeters) | float64
 15. SEPTEMBER PERCIPITATION (Millimeters) | float64
 16. OCTOBER PERCIPITATION (Millimeters) | float64
 17. NOVEMBER PERCIPITATION (Millimeters) | float64
 18. DECEMBER PERCIPITATION (Millimeters) | float64
 19. JANUARY MINIMUM (Centigrate) | float64
 20. FEBRUARY MINIMUM (Centigrate) | float64
 21. MARCH MINIMUM (Centigrate) | float64
 22. APRIL MINIMUM (Centigrate) | float64
 23. MAY MINIMUM (Cent

In [6]:
print('\nTo Predict columns: ')
for i, j in enumerate(submission.columns):
    print(f"{i+1}. {j}", end='')
    if j in train.columns:
        print(' ✅')
        y_column = j
    else:
        print(' ❌')


To Predict columns: 
1. Yield (Pounds/ Harvested Area) ❌
2. COTTON YIELD (Kg per ha) ✅
3. Cotton_Price[Dollar/ton] ❌


In [7]:
print(f'\n{any(train.columns == test.columns) = }')


any(train.columns == test.columns) = True


In [8]:
train1 = train.select_dtypes(['int64', 'float64'])
test1 = test.select_dtypes(['int64', 'float64'])

In [9]:
for i in train1.columns:
    train1[i] = train1[i].fillna(train1[i].mean())
    test1[i] = test1[i].fillna(test1[i].mean())

In [10]:
x_train, y_train, x_test, y_test = (train1.drop(y_column, axis=1), train1[y_column],
                                    test1.drop(y_column, axis=1), test1[y_column])

In [11]:
x_scaler = StandardScaler()
x_train = x_scaler.fit_transform(x_train)
x_test = x_scaler.transform(x_test)

In [25]:
model = LinearRegression()
model.fit(x_train, y_train)
score = model.score(x_test, y_test)
print(f'\nScore: {score}')


Score: 0.0


In [26]:
mse = mean_squared_error(y_test, model.predict(x_test))
print(f'MSE: {mse} | {(mse < 13147049.32475) = }')

MSE: 18085839.986804508 | (mse < 13147049.32475) = False


In [31]:
# I know this does'nt give correct Rank
for i, j in enumerate(leaderboard()):
    if mse < j:
        print(f'Rank: {i+1}')
        print(f'MSE: {mse:,.5f} | Leaderboard: {j:,}')
        break
else:
    print('High MSE')

Rank: 3
MSE: 18,085,839.98680 | Leaderboard: 22,205,271.40202
