In [71]:
import pandas as pd 
import numpy as np 


## Read the data 

In [105]:
test_path = '/home/abdulla-ovais/Desktop/vnb-foml-2024-hackathon./foml24_hackathon/test.csv'
train_path = '/home/abdulla-ovais/Desktop/vnb-foml-2024-hackathon./foml24_hackathon/train.csv'

pd1 = pd.read_csv(test_path)
pd2 = pd.read_csv(train_path)


pd1.head()
pd2.head()

Unnamed: 0,UID,AgriculturalPostalZone,AgricultureZoningCode,CropFieldConfiguration,CropSpeciesVariety,CultivatedAndWildArea,CultivatedAreaSqft1,DistrictId,FarmClassification,FarmEquipmentArea,...,TotalTaxAssessed,TotalValue,TownId,TypeOfIrrigationSystem,UndergroundStorageSqft,ValuationYear,WaterAccessPoints,WaterAccessPointsCalc,WaterReservoirCount,Target
0,12998,291674,0.0,,3.0,,1136.0,1.0,,,...,8636.716,456255.6,118.0,,,2018.0,2.0,2.0,,high
1,20860,164397,28.0,,4.0,,2083.0,1.0,,,...,18464.292,996887.6,24.0,1.0,,2018.0,3.0,3.0,1.0,medium
2,75725,616532,0.0,,2.0,,922.0,1.0,,,...,15594.568,1043780.0,9.0,1.0,,2018.0,1.0,1.0,,medium
3,106521,942111,43.0,,7.0,,,1.0,,,...,8494.618,435734.8,114.0,,,2020.0,3.0,3.0,,low
4,99467,475557,38.0,,3.0,,2225.0,3.0,,0.0,...,13517.284,885400.0,6.0,,,2020.0,4.0,4.0,,medium


## Preprocess the data, removing missing values

In [106]:
def null_values(df):
        missing_values = df.isnull().sum()
        missing_values_percent = 100 * df.isnull().sum() / len(df)
        missing_values_table = pd.concat([missing_values, missing_values_percent], axis=1)
        missing_values_table_ren_columns = missing_values_table.rename(
        columns = {0 : 'Missing Values', 1 : '% of Total Values'})
        missing_values_table_ren_columns = missing_values_table_ren_columns[
            missing_values_table_ren_columns.iloc[:,1] != 0].sort_values(
        '% of Total Values', ascending=False).round(1)
        print ("Dataframe has " + str(df.shape[1]) + " columns.\n"      
            "There are " + str(missing_values_table_ren_columns.shape[0]) +
              " columns that have missing values.")
        return missing_values_table_ren_columns

In [107]:
miss_values = null_values(pd2)
miss_values.head(20)

Dataframe has 58 columns.
There are 55 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values
FarmClassification,112552,100.0
PerimeterGuardPlantsArea,112525,100.0
UndergroundStorageSqft,112512,99.9
FieldZoneLevel,112512,99.9
HarvestStorageSqft,112457,99.9
HasGreenHouse,112305,99.8
CropFieldConfiguration,112274,99.7
FieldConstructionType,112239,99.7
CultivatedAndWildArea,112027,99.5
FieldShadeCover,111701,99.2


In [108]:
pd2 = pd2.loc[:, pd2.isnull().mean() < 0.8]

## after removing 80% missing values

In [109]:
print(pd2)

           UID  AgriculturalPostalZone  AgricultureZoningCode  \
0        12998                  291674                    0.0   
1        20860                  164397                   28.0   
2        75725                  616532                    0.0   
3       106521                  942111                   43.0   
4        99467                  475557                   38.0   
...        ...                     ...                    ...   
112564   26998                  380364                   41.0   
112565  135304                  369784                   24.0   
112566  153756                  414700                   37.0   
112567  129907                  566488                   19.0   
112568  103354                  540570                   28.0   

        CropSpeciesVariety  CultivatedAreaSqft1  DistrictId  \
0                      3.0               1136.0         1.0   
1                      4.0               2083.0         1.0   
2                      2.0    

In [110]:
from sklearn.preprocessing import LabelEncoder


# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Fit and transform the Target column
pd2['Target'] = label_encoder.fit_transform(pd2['Target'])

print(pd2)
pd2.head()

           UID  AgriculturalPostalZone  AgricultureZoningCode  \
0        12998                  291674                    0.0   
1        20860                  164397                   28.0   
2        75725                  616532                    0.0   
3       106521                  942111                   43.0   
4        99467                  475557                   38.0   
...        ...                     ...                    ...   
112564   26998                  380364                   41.0   
112565  135304                  369784                   24.0   
112566  153756                  414700                   37.0   
112567  129907                  566488                   19.0   
112568  103354                  540570                   28.0   

        CropSpeciesVariety  CultivatedAreaSqft1  DistrictId  \
0                      3.0               1136.0         1.0   
1                      4.0               2083.0         1.0   
2                      2.0    

Unnamed: 0,UID,AgriculturalPostalZone,AgricultureZoningCode,CropSpeciesVariety,CultivatedAreaSqft1,DistrictId,FarmEquipmentArea,FarmVehicleCount,FarmingCommunityId,FarmingUnitCount,...,TotalCultivatedAreaSqft,TotalTaxAssessed,TotalValue,TownId,TypeOfIrrigationSystem,ValuationYear,WaterAccessPoints,WaterAccessPointsCalc,WaterReservoirCount,Target
0,12998,291674,0.0,3.0,1136.0,1.0,,,,1.0,...,1136.0,8636.716,456255.6,118.0,,2018.0,2.0,2.0,,0
1,20860,164397,28.0,4.0,2083.0,1.0,,,,1.0,...,2083.0,18464.292,996887.6,24.0,1.0,2018.0,3.0,3.0,1.0,2
2,75725,616532,0.0,2.0,922.0,1.0,,,,1.0,...,922.0,15594.568,1043780.0,9.0,1.0,2018.0,1.0,1.0,,2
3,106521,942111,43.0,7.0,,1.0,,,,3.0,...,3202.0,8494.618,435734.8,114.0,,2020.0,3.0,3.0,,1
4,99467,475557,38.0,3.0,2225.0,3.0,0.0,2.0,,,...,2225.0,13517.284,885400.0,6.0,,2020.0,4.0,4.0,,2


## checking the columns which are having NaN values and replace it with mode values of that columns

In [121]:

# Check for columns with NaN values and count them
missing_values = pd2.isna()

# Filter out columns with zero missing values for better clarity
missing_columns = missing_values[missing_values > 0]

print("Columns with NaN values:")
print(missing_columns)


Columns with NaN values:
        UID AgriculturalPostalZone AgricultureZoningCode CropSpeciesVariety  \
0       NaN                    NaN                   NaN                NaN   
1       NaN                    NaN                   NaN                NaN   
2       NaN                    NaN                   NaN                NaN   
3       NaN                    NaN                   NaN                NaN   
4       NaN                    NaN                   NaN                NaN   
...     ...                    ...                   ...                ...   
112564  NaN                    NaN                   NaN                NaN   
112565  NaN                    NaN                   NaN                NaN   
112566  NaN                    NaN                   NaN                NaN   
112567  NaN                    NaN                   NaN                NaN   
112568  NaN                    NaN                   NaN                NaN   

       CultivatedAreaSqft1

In [118]:
pd2.shape

(112569, 35)

In [111]:
for column in pd2.columns:
    mode_value = pd2[column].mode()[0]  # Find the mode of the column
    pd2[column].fillna(mode_value, inplace=True) 

In [115]:
#pd2.fillna(0, inplace=True)
print(pd2)

           UID  AgriculturalPostalZone  AgricultureZoningCode  \
0        12998                  291674                    0.0   
1        20860                  164397                   28.0   
2        75725                  616532                    0.0   
3       106521                  942111                   43.0   
4        99467                  475557                   38.0   
...        ...                     ...                    ...   
112564   26998                  380364                   41.0   
112565  135304                  369784                   24.0   
112566  153756                  414700                   37.0   
112567  129907                  566488                   19.0   
112568  103354                  540570                   28.0   

        CropSpeciesVariety  CultivatedAreaSqft1  DistrictId  \
0                      3.0               1136.0         1.0   
1                      4.0               2083.0         1.0   
2                      2.0    

In [113]:
pd2.head()

Unnamed: 0,UID,AgriculturalPostalZone,AgricultureZoningCode,CropSpeciesVariety,CultivatedAreaSqft1,DistrictId,FarmEquipmentArea,FarmVehicleCount,FarmingCommunityId,FarmingUnitCount,...,TotalCultivatedAreaSqft,TotalTaxAssessed,TotalValue,TownId,TypeOfIrrigationSystem,ValuationYear,WaterAccessPoints,WaterAccessPointsCalc,WaterReservoirCount,Target
0,12998,291674,0.0,3.0,1136.0,1.0,0.0,2.0,2038.0,1.0,...,1136.0,8636.716,456255.6,118.0,1.0,2018.0,2.0,2.0,1.0,0
1,20860,164397,28.0,4.0,2083.0,1.0,0.0,2.0,2038.0,1.0,...,2083.0,18464.292,996887.6,24.0,1.0,2018.0,3.0,3.0,1.0,2
2,75725,616532,0.0,2.0,922.0,1.0,0.0,2.0,2038.0,1.0,...,922.0,15594.568,1043780.0,9.0,1.0,2018.0,1.0,1.0,1.0,2
3,106521,942111,43.0,7.0,1200.0,1.0,0.0,2.0,2038.0,3.0,...,3202.0,8494.618,435734.8,114.0,1.0,2020.0,3.0,3.0,1.0,1
4,99467,475557,38.0,3.0,2225.0,3.0,0.0,2.0,2038.0,1.0,...,2225.0,13517.284,885400.0,6.0,1.0,2020.0,4.0,4.0,1.0,2


## xgboost classifier

In [122]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import xgboost as xgb

# Load your dataset (replace 'your_file.csv' with your actual file)
df = pd2
X = df.drop(columns='Target')  # Replace 'target_column' with the actual column name for target
y = df['Target']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize and train the XGBoost model
model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
model.fit(X_train, y_train)

# Make predictions and calculate accuracy
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")


Accuracy: 0.60


## Random forest classifier

In [114]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

"""# Function to handle missing values by dropping rows with NaN values
def handle_missing_values(df):
    return df.dropna()  # Drop rows with NaN values"""

# Function to load data and prepare the features and target variables
def prepare_data(pd2):
    #df = handle_missing_values(df)  # Handle missing values first
    X = pd2.drop(columns='Target')  # Drop the 'Target' column to get features
    y = pd2['Target']  # Get the target column
    return X, y

# Function to split the data into training and testing sets
def split_data(X, y, test_size=0.3, random_state=42):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    return X_train, X_test, y_train, y_test

# Function to initialize and train the RandomForest model
def train_model(X_train, y_train):
    model = RandomForestClassifier(random_state=42)  # Initialize the model
    model.fit(X_train, y_train)  # Train the model with training data
    return model

# Function to make predictions and calculate accuracy
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)  # Make predictions
    accuracy = accuracy_score(y_test, y_pred)  # Calculate accuracy
    return accuracy

# Main function to tie everything together
def main(pd2):
    # Step 1: Prepare the data
    X, y = prepare_data(pd2)
    
    # Step 2: Split the data into training and testing sets
    X_train, X_test, y_train, y_test = split_data(X, y)
    
    # Step 3: Train the model
    model = train_model(X_train, y_train)
    
    # Step 4: Evaluate the model
    accuracy = evaluate_model(model, X_test, y_test)
    
    # Output the accuracy
    print(f"Accuracy: {accuracy:.2f}")

# Sample usage:
# Assuming 'df' is your DataFrame containing the data
main(df)


Accuracy: 0.60
