In [13]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

In [27]:
soil_df = pd.read_csv("../data/soil_data.csv")
soil_df.shape

(3109, 32)

In [28]:
soil_df.head()

Unnamed: 0,fips,lat,lon,elevation,slope1,slope2,slope3,slope4,slope5,slope6,...,CULTRF_LAND,CULTIR_LAND,CULT_LAND,SQ1,SQ2,SQ3,SQ4,SQ5,SQ6,SQ7
0,1001,32.536382,-86.64449,63,0.0419,0.2788,0.2984,0.2497,0.1142,0.017,...,56.293411,1.014811,57.308224,1,1,1,1,1,1,2
1,1005,31.87067,-85.405456,146,0.0158,0.1868,0.5441,0.2424,0.0106,0.0003,...,72.578804,1.828159,74.40696,3,2,1,1,1,1,1
2,1003,30.659218,-87.746067,52,0.0746,0.437,0.4415,0.0469,0.0,0.0,...,59.843639,2.996914,62.840553,3,2,1,2,1,1,1
3,1007,33.015893,-87.127148,93,0.0144,0.1617,0.3714,0.3493,0.0898,0.0134,...,1.916593,0.00833,1.924924,3,2,1,1,1,1,1
4,1009,33.977448,-86.567246,198,0.005,0.0872,0.2799,0.3576,0.1477,0.1037,...,1.891909,0.027488,1.919397,3,2,1,1,1,1,1


In [29]:
(soil_df.groupby('fips').size() > 1).values.any()

np.False_

In [None]:
import pandas as pd
import time

def read_csv_with_non_empty_rows_and_progress(filename):
    """Reads a CSV file, filters out rows with only empty values, and prints progress while reading.

    Args:
        filename (str): The name of the CSV file.

    Returns:
        pandas.DataFrame: A DataFrame containing the data from the CSV file, excluding rows with only empty values.
    """

    filtered_df = pd.DataFrame()  # Initialize an empty DataFrame

    # Read the CSV file into a DataFrame, chunk by chunk
    chunksize = 100000  # Adjust chunk size as needed
    for chunk in pd.read_csv(filename, chunksize=chunksize):
        # Filter out rows with only empty values
        filtered_chunk = chunk.dropna(how='any')

        # Print progress
        print(f"Processed {len(filtered_chunk)} rows so far")

        # Append filtered chunk to the DataFrame
        filtered_df = pd.concat([filtered_df, filtered_chunk], ignore_index=True)
        print(filtered_df.shape)
        if filtered_df.shape[0]>100000:
            break
        del chunk
        del filtered_chunk
    return filtered_df

# Example usage:
filename = "../data/train_timeseries.csv"
filtered_df = read_csv_with_non_empty_rows_and_progress(filename)

print(filtered_df)

In [None]:
filtered_df.isnull().sum()

In [None]:
filtered_df.to_csv('drought_data.csv',index = False)

In [None]:
####################################################################################################################

In [30]:
import pandas as pd
df = pd.read_csv('drought_data.csv')
df.shape

(114267, 21)

In [31]:
df.dtypes

fips             int64
date            object
PRECTOT        float64
PS             float64
QV2M           float64
T2M            float64
T2MDEW         float64
T2MWET         float64
T2M_MAX        float64
T2M_MIN        float64
T2M_RANGE      float64
TS             float64
WS10M          float64
WS10M_MAX      float64
WS10M_MIN      float64
WS10M_RANGE    float64
WS50M          float64
WS50M_MAX      float64
WS50M_MIN      float64
WS50M_RANGE    float64
score          float64
dtype: object

In [32]:
df['year'] = pd.DatetimeIndex(df['date']).year
df['month'] = pd.DatetimeIndex(df['date']).month
df['day'] = pd.DatetimeIndex(df['date']).day
df['score'] = df['score'].round().astype(int)

In [33]:
df['fips'].nunique()

129

In [35]:
df['score'].round().value_counts()

score
0    62165
1    19982
2    14131
3    10209
4     5251
5     2529
Name: count, dtype: int64

In [36]:
df = df.merge(soil_df, left_on='fips', right_on='fips')
df.shape

(114267, 55)

In [37]:
independent_variables = df.drop('score', axis=1)
independent_variables = independent_variables.drop('fips', axis=1)
independent_variables = independent_variables.drop('date', axis=1)

In [38]:
target = df['score']
target.value_counts()

score
0    62165
1    19982
2    14131
3    10209
4     5251
5     2529
Name: count, dtype: int64

In [39]:
X_train, X_test, y_train, y_test = train_test_split(independent_variables, target, test_size=0.2, random_state=0)

In [40]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE

model = RandomForestClassifier(n_estimators=10) # n_estimators is the hyperparameter
model.fit(X_train, y_train)


In [41]:
ypred = model.predict(X_test)

In [42]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report
print(confusion_matrix(y_test, ypred))

[[11818   395   131    33     9     0]
 [ 1963  1700   353    77    13     3]
 [  788   520  1226   246    32     5]
 [  478   160   423   871   107     7]
 [  215    71    83   179   415    35]
 [   62    19    16    15    65   321]]


In [43]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
print(accuracy_score(y_test, ypred))
print(precision_score(y_test, ypred, average='weighted'))
print(recall_score(y_test, ypred, average='weighted'))

0.715454625010939
0.6943545117990415
0.715454625010939


In [1]:
import pandas as pd

# Path to the large CSV file
input_file_path = r"C:\Users\shrey\Downloads\archive\test_timeseries\test_timeseries.csv"
# Path to save the smaller CSV file
output_file_path = 'smaller_test_file.csv'

# Read the first 1 million rows of the large CSV file
df = pd.read_csv(input_file_path, nrows=1000000)

# Save the subset to a new CSV file
df.to_csv(output_file_path, index=False)

print(f'Successfully saved the first 1 million rows to {output_file_path}')


Successfully saved the first 1 million rows to smaller_test_file.csv
