In [1]:
import pandas as pd
import numpy as np
import numpy as np
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestRegressor
import sklearn.model_selection as xval
from sklearn.datasets import fetch_openml
import forestci as fci
from sklearn import metrics
from sklearn.metrics import r2_score
import statistics
import matplotlib.pyplot as plt 

Failed to import duecredit due to No module named 'duecredit'


In [2]:
def masked_mae(X_true, X_pred, mask):
    masked_diff = X_true[mask] - X_pred[mask]
    return np.mean(np.abs(masked_diff))
def generate_random_column_samples(column):
    col_mask = np.isnan(column)
    n_missing = np.sum(col_mask)
    if n_missing == len(column):
        #logging.warn("No observed values in column")
        return np.zeros_like(column)

    mean = np.nanmean(column)
    std = np.nanstd(column)

    if np.isclose(std, 0):
        return np.array([mean] * n_missing)
    else:
        return np.random.randn(n_missing) * std + mean

In [3]:
import warnings
from sklearn.utils import check_array

class Solver(object):
    def __init__(
            self,
            fill_method="zero",
            min_value=None,
            max_value=None,
            normalizer=None):
        self.fill_method = fill_method
        self.min_value = min_value
        self.max_value = max_value
        self.normalizer = normalizer

    def __repr__(self):
        return str(self)

    def __str__(self):
        field_list = []
        for (k, v) in sorted(self.__dict__.items()):
            if v is None or isinstance(v, (float, int)):
                field_list.append("%s=%s" % (k, v))
            elif isinstance(v, str):
                field_list.append("%s='%s'" % (k, v))
        return "%s(%s)" % (
            self.__class__.__name__,
            ", ".join(field_list))

    def _check_input(self, X):
        if len(X.shape) != 2:
            raise ValueError("Expected 2d matrix, got %s array" % (X.shape,))

    def _check_missing_value_mask(self, missing):
        if not missing.any():
            warnings.simplefilter("always")
            warnings.warn("Input matrix is not missing any values")
        if missing.all():
            raise ValueError("Input matrix must have some non-missing values")

    def _fill_columns_with_fn(self, X, missing_mask, col_fn):
        for col_idx in range(X.shape[1]):
            missing_col = missing_mask[:, col_idx]
            n_missing = missing_col.sum()
            if n_missing == 0:
                continue
            col_data = X[:, col_idx]
            fill_values = col_fn(col_data)
            if np.all(np.isnan(fill_values)):
                fill_values = 0
            X[missing_col, col_idx] = fill_values

    def fill(
            self,
            X,
            missing_mask,
            fill_method=None,
            inplace=False):
        """
        Parameters
        ----------
        X : np.array
            Data array containing NaN entries
        missing_mask : np.array
            Boolean array indicating where NaN entries are
        fill_method : str
            "zero": fill missing entries with zeros
            "mean": fill with column means
            "median" : fill with column medians
            "min": fill with min value per column
            "random": fill with gaussian samples according to mean/std of column
        inplace : bool
            Modify matrix or fill a copy
        """
        X = check_array(X, force_all_finite=False)

        if not inplace:
            X = X.copy()

        if not fill_method:
            fill_method = self.fill_method

        if fill_method not in ("zero", "mean", "median", "min", "random"):
            raise ValueError("Invalid fill method: '%s'" % (fill_method))
        elif fill_method == "zero":
            # replace NaN's with 0
            X[missing_mask] = 0
        elif fill_method == "mean":
            self._fill_columns_with_fn(X, missing_mask, np.nanmean)
        elif fill_method == "median":
            self._fill_columns_with_fn(X, missing_mask, np.nanmedian)
        elif fill_method == "min":
            self._fill_columns_with_fn(X, missing_mask, np.nanmin)
        elif fill_method == "random":
            self._fill_columns_with_fn(
                X,
                missing_mask,
                col_fn=generate_random_column_samples)
        return X

    def prepare_input_data(self, X):
        """
        Check to make sure that the input matrix and its mask of missing
        values are valid. Returns X and missing mask.
        """
        X = check_array(X, force_all_finite=False)
        if X.dtype != "f" and X.dtype != "d":
            X = X.astype(float)

        self._check_input(X)
        missing_mask = np.isnan(X)
        self._check_missing_value_mask(missing_mask)
        return X, missing_mask

    def clip(self, X):
        """
        Clip values to fall within any global or column-wise min/max constraints
        """
        X = np.asarray(X)
        if self.min_value is not None:
            X[X < self.min_value] = self.min_value
        if self.max_value is not None:
            X[X > self.max_value] = self.max_value
        return X

    def project_result(self, X):
        """
        First undo normalization and then clip to the user-specified min/max
        range.
        """
        X = np.asarray(X)
        if self.normalizer is not None:
            X = self.normalizer.inverse_transform(X)
        return self.clip(X)

    def solve(self, X, missing_mask):
        """
        Given an initialized matrix X and a mask of where its missing values
        had been, return a completion of X.
        """
        raise ValueError("%s.solve not yet implemented!" % (
            self.__class__.__name__,))

    def fit_transform(self, X, y=None):
        """
        Fit the imputer and then transform input `X`
        Note: all imputations should have a `fit_transform` method,
        but only some (like IterativeImputer in sklearn) also support inductive
        mode using `fit` or `fit_transform` on `X_train` and then `transform`
        on new `X_test`.
        """
        X_original, missing_mask = self.prepare_input_data(X)
        observed_mask = ~missing_mask
        X = X_original.copy()
        if self.normalizer is not None:
            X = self.normalizer.fit_transform(X)
        X_filled = self.fill(X, missing_mask, inplace=True)
        if not isinstance(X_filled, np.ndarray):
            raise TypeError(
                "Expected %s.fill() to return NumPy array but got %s" % (
                    self.__class__.__name__,
                    type(X_filled)))

        X_result = self.solve(X_filled, missing_mask)
        if not isinstance(X_result, np.ndarray):
            raise TypeError(
                "Expected %s.solve() to return NumPy array but got %s" % (
                    self.__class__.__name__,
                    type(X_result)))

        X_result = self.project_result(X=X_result)
        X_result[observed_mask] = X_original[observed_mask]
        return X_result

    def fit(self, X, y=None):
        """
        Fit the imputer on input `X`.
        Note: all imputations should have a `fit_transform` method,
        but only some (like IterativeImputer in sklearn) also support inductive
        mode using `fit` or `fit_transform` on `X_train` and then `transform`
        on new `X_test`.
        """
        raise ValueError(
            "%s.fit not implemented! This imputation algorithm likely "
            "doesn't support inductive mode. Only fit_transform is "
            "supported at this time." % (
                self.__class__.__name__,))

    def transform(self, X, y=None):
        """
        Transform input `X`.
        Note: all imputations should have a `fit_transform` method,
        but only some (like IterativeImputer in sklearn) also support inductive
        mode using `fit` or `fit_transform` on `X_train` and then `transform`
        on new `X_test`.
        """
        raise ValueError(
            "%s.transform not implemented! This imputation algorithm likely "
            "doesn't support inductive mode. Only %s.fit_transform is "
            "supported at this time." % (
                self.__class__.__name__, self.__class__.__name__))

In [4]:
from sklearn.utils.extmath import randomized_svd
from sklearn.utils import check_array

F32PREC = np.finfo(np.float32).eps


class SoftImpute(Solver):
    """
    Implementation of the SoftImpute algorithm from:
    "Spectral Regularization Algorithms for Learning Large Incomplete Matrices"
    by Mazumder, Hastie, and Tibshirani.
    """
    def __init__(
            self,
            shrinkage_value=None,
            convergence_threshold=0.001,
            max_iters=100,
            max_rank=None,
            n_power_iterations=1,
            init_fill_method="zero",
            min_value=None,
            max_value=None,
            normalizer=None,
            verbose=True):
        """
        Parameters
        ----------
        shrinkage_value : float
            Value by which we shrink singular values on each iteration. If
            omitted then the default value will be the maximum singular
            value of the initialized matrix (zeros for missing values) divided
            by 50.
        convergence_threshold : float
            Minimum ration difference between iterations (as a fraction of
            the Frobenius norm of the current solution) before stopping.
        max_iters : int
            Maximum number of SVD iterations
        max_rank : int, optional
            Perform a truncated SVD on each iteration with this value as its
            rank.
        n_power_iterations : int
            Number of power iterations to perform with randomized SVD
        init_fill_method : str
            How to initialize missing values of data matrix, default is
            to fill them with zeros.
        min_value : float
            Smallest allowable value in the solution
        max_value : float
            Largest allowable value in the solution
        normalizer : object
            Any object (such as BiScaler) with fit() and transform() methods
        verbose : bool
            Print debugging info
        """
        Solver.__init__(
            self,
            fill_method=init_fill_method,
            min_value=min_value,
            max_value=max_value,
            normalizer=normalizer)
        self.shrinkage_value = shrinkage_value
        self.convergence_threshold = convergence_threshold
        self.max_iters = max_iters
        self.max_rank = max_rank
        self.n_power_iterations = n_power_iterations
        self.verbose = verbose

    def _converged(self, X_old, X_new, missing_mask):
        # check for convergence
        old_missing_values = X_old[missing_mask]
        new_missing_values = X_new[missing_mask]
        difference = old_missing_values - new_missing_values
        ssd = np.sum(difference ** 2)
        old_norm = np.sqrt((old_missing_values ** 2).sum())
        # edge cases
        if old_norm == 0 or (old_norm < F32PREC and np.sqrt(ssd) > F32PREC):
            return False
        else:
            return (np.sqrt(ssd) / old_norm) < self.convergence_threshold

    def _svd_step(self, X, shrinkage_value, max_rank=None):
        """
        Returns reconstructed X from low-rank thresholded SVD and
        the rank achieved.
        """
        if max_rank:
            # if we have a max rank then perform the faster randomized SVD
            (U, s, V) = randomized_svd(
                X,
                max_rank,
                n_iter=self.n_power_iterations)
        else:
            # perform a full rank SVD using ARPACK
            (U, s, V) = np.linalg.svd(
                X,
                full_matrices=False,
                compute_uv=True)
        s_thresh = np.maximum(s - shrinkage_value, 0)
        rank = (s_thresh > 0).sum()
        s_thresh = s_thresh[:rank]
        U_thresh = U[:, :rank]
        V_thresh = V[:rank, :]
        S_thresh = np.diag(s_thresh)
        X_reconstruction = np.dot(U_thresh, np.dot(S_thresh, V_thresh))
        return X_reconstruction, rank

    def _max_singular_value(self, X_filled):
        # quick decomposition of X_filled into rank-1 SVD
        _, s, _ = randomized_svd(
            X_filled,
            1,
            n_iter=5)
        return s[0]

    def solve(self, X, missing_mask):
        X = check_array(X, force_all_finite=False)

        X_init = X.copy()

        X_filled = X
        observed_mask = ~missing_mask
        max_singular_value = self._max_singular_value(X_filled)
        if self.verbose:
            print("[SoftImpute] Max Singular Value of X_init = %f" % (
                max_singular_value))

        if self.shrinkage_value:
            shrinkage_value = self.shrinkage_value
        else:
            # totally hackish heuristic: keep only components
            # with at least 1/50th the max singular value
            shrinkage_value = max_singular_value / 50.0

        for i in range(self.max_iters):
            X_reconstruction, rank = self._svd_step(
                X_filled,
                shrinkage_value,
                max_rank=self.max_rank)
            X_reconstruction = self.clip(X_reconstruction)

            # print error on observed data
            if self.verbose:
                mae = masked_mae(
                    X_true=X_init,
                    X_pred=X_reconstruction,
                    mask=observed_mask)
                print(
                    "[SoftImpute] Iter %d: observed MAE=%0.6f rank=%d" % (
                        i + 1,
                        mae,
                        rank))

            converged = self._converged(
                X_old=X_filled,
                X_new=X_reconstruction,
                missing_mask=missing_mask)
            X_filled[missing_mask] = X_reconstruction[missing_mask]
            if converged:
                break
        if self.verbose:
            print("[SoftImpute] Stopped after iteration %d for lambda=%f" % (
                i + 1,
                shrinkage_value))

        return X_filled

# Reading raw data

In [6]:
mod_df = pd.read_csv('Tomato/Tomato/BAK/Tomato_UP_Arrival_Raw.csv')
price_df = pd.read_csv('Tomato/Tomato/BAK/Tomato_UP_Price_Raw.csv')

In [8]:
m = price_df['Price Date']
for  i in range(len(m)):
  date = m[i][:2]+'-'+m[i][3:6]+'-'+m[i][7:]
  m[i] = date

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  m[i] = date


In [9]:
price_df['Date'] = m

In [10]:
price_df= price_df.drop(['State','State Code','District Code','Market Code','Commodity','Commodity Code',' Variety','Grade',
                        'Min Price','Max Price','Price Date'], axis=1)

In [11]:
mod_df= mod_df.drop(['State','State Code','Commodity','Commodity Code'], axis=1)

In [12]:
mod_df.rename(columns = {'Volume Date':'Date'}, inplace = True)

In [13]:
df_new = pd.merge(mod_df,price_df,on=['Date','District','Market'])

In [16]:
mod_df = df_new

In [17]:
mandi_list = mod_df['Market'].unique()
print(mandi_list,len(mandi_list),sep='\n')

['Barabanki' 'Bareilly' 'Bijnaur' 'Sikanderabad' 'Faizabad' 'Ghaziabad'
 'Muradnagar' 'Gorakhpur' 'Lucknow' 'Shamli' 'Pilibhit' 'Jayas'
 'Shahjahanpur' 'Banda' 'Buland Shahr' 'Karvi' 'Farukhabad' 'Hapur' 'Orai'
 'Jhansi' 'Kanpur(Grain)' 'Raibareilly' 'Sitapur' 'Varanasi(Grain)'
 'Allahabad' 'Etawah' 'Unnao' 'Dadri' 'Badayoun' 'Lalitpur' 'Jalaun'
 'Khurja' 'Muzzafarnagar' 'Baraut' 'Khatauli' 'Divai' 'Saharanpur'
 'Shahpur' 'Bachranwa' 'Anandnagar' 'Nanuta' 'Rampurmaniharan'
 'Chutmalpur' 'Sultanpurchilkana' 'Gulavati' 'Jahangirabad' 'Purwa'
 'Bangarmau' 'Tulsipur' 'Utraula' 'Sahiyapur' 'Lakhimpur' 'Tilhar'
 'Sultanpur' 'Ajuha' 'Balrampur' 'Muradabad' 'Tundla' 'Bahraich' 'Anwala'
 'Soharatgarh' 'Jalalabad' 'Chandausi' 'Sikarpur' 'Shahabad' 'Bahedi'
 'Salon' 'Maigalganj' 'Devband' 'Lalganj' 'Baberu' 'Kiratpur' 'Sandi'
 'Thanabhawan' 'Vishalpur' 'Puwaha' 'Najibabad' 'Ait' 'Tanda' 'Nagina'
 'Jagnair' 'Baruwasagar' 'Moth' 'Anoop Shahar' 'Atarra' 'Madhoganj'
 'Gangoh' 'Parikshitgarh' 'Dhanura

In [18]:
n = mod_df['Date'].unique()
len(n)

3934

In [19]:
mod_df.head()

Unnamed: 0,District,Market,Arrivals,Date,Modal Price
0,Barabanki,Barabanki,1.2,01-Jan-2008,600.0
1,Bareilly,Bareilly,25.0,01-Jan-2008,690.0
2,Bijnor,Bijnaur,2.5,01-Jan-2008,690.0
3,Bulandshahar,Sikanderabad,3.0,01-Jan-2008,414.0
4,Faizabad,Faizabad,4.0,01-Jan-2008,400.0


In [20]:
mod_df = mod_df.sort_values(by = ['Market','Date'])
mod_df = mod_df.reset_index(drop=True)

In [23]:
mod_df.head()

Unnamed: 0,District,Market,Arrivals,Date,Modal Price
0,Auraiya,Achalda,3.0,01-Apr-2013,825.0
1,Auraiya,Achalda,20.0,01-Apr-2014,1150.0
2,Auraiya,Achalda,3.5,01-Apr-2015,1100.0
3,Auraiya,Achalda,0.2,01-Aug-2011,1025.0
4,Auraiya,Achalda,3.5,01-Aug-2012,2250.0


In [24]:
districts = mod_df['District'].unique()
markets = mod_df['Market'].unique()
len(districts),len(markets)

(69, 234)

In [25]:
mod_df.columns

Index(['District', 'Market', 'Arrivals', 'Date', 'Modal Price'], dtype='object')

In [26]:
mod_df.head()

Unnamed: 0,District,Market,Arrivals,Date,Modal Price
0,Auraiya,Achalda,3.0,01-Apr-2013,825.0
1,Auraiya,Achalda,20.0,01-Apr-2014,1150.0
2,Auraiya,Achalda,3.5,01-Apr-2015,1100.0
3,Auraiya,Achalda,0.2,01-Aug-2011,1025.0
4,Auraiya,Achalda,3.5,01-Aug-2012,2250.0


In [49]:
df=mod_df

month_array = []
day_array=[]
year_array=[]
for i in range(df.shape[0]):
    month_array.append(df["Date"][i][3:6])
    day_array.append(int(df['Date'][i][0:2]))
    year_array.append(int(df['Date'][i][7:]))
    
df['Month_name']= month_array
df['Day'] = day_array
df['Year']= year_array

month_names = np.array(df['Month_name'])
month_names = list(month_names) 
months = month_names.copy()

for i in range(len(months)):
  if month_names[i] == "Jan":
    months[i] = 1
  elif month_names[i] == "Feb":
    months[i] = 2
  elif month_names[i] == "Mar":
    months[i] = 3
  elif month_names[i] == "Apr":
    months[i] = 4
  elif month_names[i] == "May":
    months[i] = 5
  elif month_names[i] == "Jun":
    months[i] = 6
  elif month_names[i] == "Jul":
    months[i] = 7
  elif month_names[i] == "Aug":
    months[i] = 8
  elif month_names[i] == "Sep":
    months[i] = 9
  elif month_names[i] == "Oct":
    months[i] = 10
  elif month_names[i] == "Nov":
    months[i] = 11
  elif month_names[i] == "Dec":
    months[i] = 12

df['Month']=months

In [50]:
df.head()

Unnamed: 0,District,Market,Arrivals,Date,Modal Price,Month_name,Day,Year,Month
0,Auraiya,Achalda,3.0,01-Apr-2013,825.0,Apr,1,2013,4
1,Auraiya,Achalda,20.0,01-Apr-2014,1150.0,Apr,1,2014,4
2,Auraiya,Achalda,3.5,01-Apr-2015,1100.0,Apr,1,2015,4
3,Auraiya,Achalda,0.2,01-Aug-2011,1025.0,Aug,1,2011,8
4,Auraiya,Achalda,3.5,01-Aug-2012,2250.0,Aug,1,2012,8


In [51]:
df['Year'].unique()

array([2013, 2014, 2015, 2011, 2012, 2016, 2017, 2018, 2010, 2009, 2008])

In [52]:
mod_df = df.loc[df['Year'] != 2008]

In [53]:
mod_df = mod_df.reset_index(drop=True)

In [54]:
mod_df.shape

(188275, 9)

In [55]:
df = pd.read_csv('df_with_last_year_price_2009_2018.csv')
df = df.drop(['Unnamed: 0'],axis=1)
df = df.loc[df['Year'] != 2008]

In [56]:
dates = df['Date'].unique()

In [57]:
dates

array(['01-Jan-2009', '02-Jan-2009', '03-Jan-2009', ..., '29-Dec-2018',
       '30-Dec-2018', '31-Dec-2018'], dtype=object)

In [58]:
mod_df.columns

Index(['District', 'Market', 'Arrivals', 'Date', 'Modal Price', 'Month_name',
       'Day', 'Year', 'Month'],
      dtype='object')

In [59]:
mod_df =mod_df.drop(['Month_name', 'Day', 'Year', 'Month'],axis=1)

In [60]:
mod_df = mod_df.sort_values(by = ['Market','Date'])
mod_df = mod_df.reset_index(drop=True)

In [61]:
dates

array(['01-Jan-2009', '02-Jan-2009', '03-Jan-2009', ..., '29-Dec-2018',
       '30-Dec-2018', '31-Dec-2018'], dtype=object)

In [62]:
mod_df.shape

(188275, 5)

In [66]:
import math

new_df_list=[]
count=0
for market in markets:
    
    cur_df = mod_df.loc[mod_df['Market'] == market]
#     print(cur_df.head())
    cur_dates = np.array(cur_df['Date'])
    district = cur_df['District'].unique()[0]
#     print(district)
#     print(cur_dates)
    for date in dates:
        if date not in cur_dates:
            new_df_list.append( [district, market,math.nan,date
                                     ,math.nan])
        else:
            count +=1
            

    

In [67]:
len(new_df_list)

667129

In [68]:
count

187439

In [69]:
neww_df = pd.DataFrame(new_df_list, columns=['District', 'Market', 'Arrivals', 'Date', 'Modal_Price'])

In [70]:
neww_df.tail()

Unnamed: 0,District,Market,Arrivals,Date,Modal_Price
667124,Ghazipur,Yusufpur,,12-Dec-2018,
667125,Ghazipur,Yusufpur,,16-Dec-2018,
667126,Ghazipur,Yusufpur,,23-Dec-2018,
667127,Ghazipur,Yusufpur,,25-Dec-2018,
667128,Ghazipur,Yusufpur,,30-Dec-2018,


In [71]:
neww_df.shape

(667129, 5)

In [72]:
neww_df = neww_df.drop_duplicates(
  subset = ['Market', 'Date'],
  keep = 'first').reset_index(drop = True)

In [73]:
neww_df.shape

(667129, 5)

In [74]:
mod_df_ = mod_df.append(neww_df)

In [75]:
mod_df_.shape

(855404, 6)

In [76]:
mod_df_ = mod_df_.drop_duplicates(
  subset = ['Market', 'Date'],
  keep = 'first').reset_index(drop = True)

In [77]:
mod_df_.shape

(854568, 6)

In [78]:
mod_df_.head()

Unnamed: 0,District,Market,Arrivals,Date,Modal Price,Modal_Price
0,Auraiya,Achalda,3.0,01-Apr-2013,825.0,
1,Auraiya,Achalda,20.0,01-Apr-2014,1150.0,
2,Auraiya,Achalda,3.5,01-Apr-2015,1100.0,
3,Auraiya,Achalda,0.2,01-Aug-2011,1025.0,
4,Auraiya,Achalda,3.5,01-Aug-2012,2250.0,


In [79]:
mod_df.columns

Index(['District', 'Market', 'Arrivals', 'Date', 'Modal Price'], dtype='object')

In [80]:
arr1 = mod_df_['Modal Price'].to_numpy()
arr2 = mod_df_['Arrivals'].to_numpy()
arr1.shape,arr2.shape

((854568,), (854568,))

In [81]:
total_days = 3652 # from 01-01-2009 to 31-12-2018
arr1 = arr1.reshape(len(mandi_list),total_days)
arr2 = arr2.reshape(len(mandi_list),total_days)
arr1.shape,arr2.shape

((234, 3652), (234, 3652))

In [82]:
X_price_incomplete = arr1
X_arr_incomplete = arr2
X_price_incomplete.shape,X_arr_incomplete.shape

((234, 3652), (234, 3652))

In [83]:
year_window = 1
start_year =2009
total_years = 10 #2006-2016 

In [84]:
def getNumberofDays(year):
    p = pd.Period(f'{year}-{1}-1')
    number_of_days = p.is_leap_year+365
    #number_of_days = sum([pd.Period(f'{year}-{i}-1').daysinmonth for i in range(1,13)])
    return number_of_days

In [85]:
day_window_list = []
i=0
while(i<total_years):
    base_year  = start_year+i
    present_day_window = 0
    for j in range(year_window):
        if(i+j<total_years):
            year  = base_year+j
            present_day_window += getNumberofDays(year)
        else:
            break
    day_window_list.append(present_day_window)
    i += year_window

day_window_list

[365, 365, 365, 366, 365, 365, 365, 366, 365, 365]

In [86]:
day_window_slice_list = []
sum_days=0
for number_days in day_window_list:
    sum_days += number_days
    day_window_slice_list.append(sum_days)
day_window_slice_list.pop(-1)
day_window_slice_list

[365, 730, 1095, 1461, 1826, 2191, 2556, 2922, 3287]

In [87]:
print(X_price_incomplete.shape)
X_price_incomplete_list = np.split(X_price_incomplete,day_window_slice_list,axis=1)
for X_price_incomplete_sub in X_price_incomplete_list:
    print(X_price_incomplete_sub.shape)

(234, 3652)
(234, 365)
(234, 365)
(234, 365)
(234, 366)
(234, 365)
(234, 365)
(234, 365)
(234, 366)
(234, 365)
(234, 365)


In [88]:
price_min_value_list=[]
price_max_value_list=[]
for X_price in  X_price_incomplete_list:
    price_df = pd.DataFrame(data = X_price.reshape(-1,1))
    min_value = price_df.describe(percentiles=[0.03,0.97]).iloc[4][0]
    max_value = price_df.describe(percentiles=[0.03,0.97]).iloc[6][0]
    price_min_value_list.append(min_value)
    price_max_value_list.append(max_value)
price_min_value_list,price_max_value_list

([400.0, 400.0, 380.0, 400.0, 380.0, 400.0, 400.0, 400.0, 400.0, 400.0],
 [3500.0,
  3500.0,
  3350.0,
  3362.9000000000087,
  3500.0,
  3425.0,
  3300.0,
  3250.0,
  3250.0,
  3400.0])

In [89]:
X_price_filled_list = []
for i in range(len(X_price_incomplete_list)):
    X_price_filled = SoftImpute(convergence_threshold=0.001,init_fill_method="min",min_value=price_min_value_list[i],max_value=price_max_value_list[i]).fit_transform(X_price_incomplete_list[i])
    X_price_filled_list.append(X_price_filled)

[SoftImpute] Max Singular Value of X_init = 198089.638309
[SoftImpute] Iter 1: observed MAE=157.713565 rank=53
[SoftImpute] Iter 2: observed MAE=157.769783 rank=53
[SoftImpute] Iter 3: observed MAE=157.780730 rank=53
[SoftImpute] Iter 4: observed MAE=157.802560 rank=53
[SoftImpute] Iter 5: observed MAE=157.823400 rank=53
[SoftImpute] Iter 6: observed MAE=157.837643 rank=53
[SoftImpute] Iter 7: observed MAE=157.847431 rank=53
[SoftImpute] Iter 8: observed MAE=157.854167 rank=53
[SoftImpute] Iter 9: observed MAE=157.858916 rank=53
[SoftImpute] Iter 10: observed MAE=157.862459 rank=53
[SoftImpute] Iter 11: observed MAE=157.864971 rank=53
[SoftImpute] Iter 12: observed MAE=157.866836 rank=53
[SoftImpute] Iter 13: observed MAE=157.868494 rank=53
[SoftImpute] Iter 14: observed MAE=157.862229 rank=52
[SoftImpute] Stopped after iteration 14 for lambda=3961.792766
[SoftImpute] Max Singular Value of X_init = 193265.974605
[SoftImpute] Iter 1: observed MAE=156.496874 rank=53
[SoftImpute] Iter 2: 

In [90]:
X_price_filled_list = []
for i in range(len(X_price_incomplete_list)):
    X_price_filled = SoftImpute(convergence_threshold=0.001,init_fill_method="min",min_value=price_min_value_list[i],max_value=price_max_value_list[i]).fit_transform(X_price_incomplete_list[i])
    X_price_filled_list.append(X_price_filled)

[SoftImpute] Max Singular Value of X_init = 198089.638309
[SoftImpute] Iter 1: observed MAE=157.713565 rank=53
[SoftImpute] Iter 2: observed MAE=157.769783 rank=53
[SoftImpute] Iter 3: observed MAE=157.780730 rank=53
[SoftImpute] Iter 4: observed MAE=157.802560 rank=53
[SoftImpute] Iter 5: observed MAE=157.823400 rank=53
[SoftImpute] Iter 6: observed MAE=157.837643 rank=53
[SoftImpute] Iter 7: observed MAE=157.847431 rank=53
[SoftImpute] Iter 8: observed MAE=157.854167 rank=53
[SoftImpute] Iter 9: observed MAE=157.858916 rank=53
[SoftImpute] Iter 10: observed MAE=157.862459 rank=53
[SoftImpute] Iter 11: observed MAE=157.864971 rank=53
[SoftImpute] Iter 12: observed MAE=157.866836 rank=53
[SoftImpute] Iter 13: observed MAE=157.868494 rank=53
[SoftImpute] Iter 14: observed MAE=157.862229 rank=52
[SoftImpute] Stopped after iteration 14 for lambda=3961.792766
[SoftImpute] Max Singular Value of X_init = 193265.974605
[SoftImpute] Iter 1: observed MAE=156.496874 rank=53
[SoftImpute] Iter 2: 

In [91]:
X_price_filled = np.hstack(X_price_filled_list)
X_price_filled.shape

(234, 3652)

In [92]:
print(X_arr_incomplete.shape)
X_arr_incomplete_list = np.split(X_arr_incomplete,day_window_slice_list,axis=1)
for X_arr_incomplete_sub in X_arr_incomplete_list:
    print(X_arr_incomplete_sub.shape)

(234, 3652)
(234, 365)
(234, 365)
(234, 365)
(234, 366)
(234, 365)
(234, 365)
(234, 365)
(234, 366)
(234, 365)
(234, 365)


In [93]:
arr_min_value_list=[]
arr_max_value_list=[]
for X_arr in  X_arr_incomplete_list:
    arr_df = pd.DataFrame(data = X_arr.reshape(-1,1))
    min_value = arr_df.describe(percentiles=[0.03,0.97]).iloc[4][0]
    max_value = arr_df.describe(percentiles=[0.03,0.97]).iloc[6][0]
    arr_min_value_list.append(min_value)
    arr_max_value_list.append(max_value)
arr_min_value_list,arr_max_value_list

([0.2, 0.2, 0.2, 0.2, 0.2, 0.3, 0.2, 0.2, 0.2, 0.2],
 [70.0, 71.63000000000102, 75.0, 75.0, 75.0, 86.0, 90.0, 95.0, 85.0, 80.0])

In [94]:
X_arr_filled_list = []
for i in range(len(X_arr_incomplete_list)):
    X_arr_filled = SoftImpute(convergence_threshold=0.001,init_fill_method="min",min_value=arr_min_value_list[i],max_value=arr_max_value_list[i]).fit_transform(X_arr_incomplete_list[i])
    X_arr_filled_list.append(X_arr_filled)

[SoftImpute] Max Singular Value of X_init = 4760.733869
[SoftImpute] Iter 1: observed MAE=5.249295 rank=33
[SoftImpute] Iter 2: observed MAE=5.249355 rank=33
[SoftImpute] Iter 3: observed MAE=5.249370 rank=33
[SoftImpute] Iter 4: observed MAE=5.249375 rank=33
[SoftImpute] Iter 5: observed MAE=5.249377 rank=33
[SoftImpute] Iter 6: observed MAE=5.249378 rank=33
[SoftImpute] Iter 7: observed MAE=5.249379 rank=33
[SoftImpute] Iter 8: observed MAE=5.249379 rank=33
[SoftImpute] Iter 9: observed MAE=5.249380 rank=33
[SoftImpute] Iter 10: observed MAE=5.249380 rank=33
[SoftImpute] Iter 11: observed MAE=5.249380 rank=33
[SoftImpute] Iter 12: observed MAE=5.249380 rank=33
[SoftImpute] Iter 13: observed MAE=5.249380 rank=33
[SoftImpute] Iter 14: observed MAE=5.249381 rank=33
[SoftImpute] Iter 15: observed MAE=5.249381 rank=33
[SoftImpute] Iter 16: observed MAE=5.249381 rank=33
[SoftImpute] Iter 17: observed MAE=5.249381 rank=33
[SoftImpute] Iter 18: observed MAE=5.249381 rank=33
[SoftImpute] Iter

In [95]:
X_arr_filled = np.hstack(X_arr_filled_list)
X_arr_filled.shape

(234, 3652)

In [96]:
X_price_filled.shape

(234, 3652)

In [97]:
np.save('Tomato_price_post_234_market_imp_v4.npy', X_price_filled)
np.save('Tomato_arrival_post_234_market_imp_v4.npy', X_arr_filled)

In [98]:
X_price_filled = X_price_filled.reshape(1,X_price_filled.size)[0]
X_arr_filled = X_arr_filled.reshape(1,X_arr_filled.size)[0]
X_arr_filled

array([ 3.       , 20.       ,  3.5      , ...,  0.2      ,  0.2225413,
        0.2      ])

In [99]:
mod_df_['Imp_Price'] = X_price_filled
mod_df_['Imp_Arrival'] = X_arr_filled
ArrivalNanFlag = np.isnan(mod_df_['Arrivals'])
PriceNanFlag = np.isnan(mod_df_['Modal_Price'])
mod_df_['PriceNanFlag'] = PriceNanFlag
mod_df_['ArrivalNanFlag'] = ArrivalNanFlag
mod_df_.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 854568 entries, 0 to 854567
Data columns (total 10 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   District        854568 non-null  object 
 1   Market          854568 non-null  object 
 2   Arrivals        187439 non-null  float64
 3   Date            854568 non-null  object 
 4   Modal Price     187439 non-null  float64
 5   Modal_Price     0 non-null       float64
 6   Imp_Price       854568 non-null  float64
 7   Imp_Arrival     854568 non-null  float64
 8   PriceNanFlag    854568 non-null  bool   
 9   ArrivalNanFlag  854568 non-null  bool   
dtypes: bool(2), float64(5), object(3)
memory usage: 53.8+ MB


In [100]:
mod_df_.tail()

Unnamed: 0,District,Market,Arrivals,Date,Modal Price,Modal_Price,Imp_Price,Imp_Arrival,PriceNanFlag,ArrivalNanFlag
854563,Ghazipur,Yusufpur,,12-Dec-2018,,,401.868199,0.318669,True,True
854564,Ghazipur,Yusufpur,,16-Dec-2018,,,400.0,0.2,True,True
854565,Ghazipur,Yusufpur,,23-Dec-2018,,,443.539297,0.2,True,True
854566,Ghazipur,Yusufpur,,25-Dec-2018,,,400.0,0.222541,True,True
854567,Ghazipur,Yusufpur,,30-Dec-2018,,,486.739315,0.2,True,True


In [101]:
mod_df_.shape

(854568, 10)

In [103]:
mod_df_ =mod_df_.drop(['Modal Price', 'Modal_Price','Arrivals'],axis=1)

In [105]:
mod_df_.head()

Unnamed: 0,District,Market,Date,Imp_Price,Imp_Arrival,PriceNanFlag,ArrivalNanFlag
0,Auraiya,Achalda,01-Apr-2013,825.0,3.0,True,False
1,Auraiya,Achalda,01-Apr-2014,1150.0,20.0,True,False
2,Auraiya,Achalda,01-Apr-2015,1100.0,3.5,True,False
3,Auraiya,Achalda,01-Aug-2011,1025.0,0.2,True,False
4,Auraiya,Achalda,01-Aug-2012,2250.0,3.5,True,False


In [107]:
mod_df_.to_csv('Tomato/Tomato/tomato_234mandis_post_impute.csv')

# Adding lat long data

In [108]:
df = pd.read_csv('Tomato/Tomato/tomato_234mandis_post_impute.csv')

In [109]:
month_array = []
day_array=[]
year_array=[]
for i in range(df.shape[0]):
    month_array.append(df["Date"][i][3:6])
    day_array.append(df['Date'][i][0:2])
    year_array.append(df['Date'][i][7:])
    
df['Month_name']= month_array
df['Day'] = day_array
df['Year']= year_array

month_names = np.array(df['Month_name'])
month_names = list(month_names) 
months = month_names.copy()

for i in range(len(months)):
  if month_names[i] == "Jan":
    months[i] = 1
  elif month_names[i] == "Feb":
    months[i] = 2
  elif month_names[i] == "Mar":
    months[i] = 3
  elif month_names[i] == "Apr":
    months[i] = 4
  elif month_names[i] == "May":
    months[i] = 5
  elif month_names[i] == "Jun":
    months[i] = 6
  elif month_names[i] == "Jul":
    months[i] = 7
  elif month_names[i] == "Aug":
    months[i] = 8
  elif month_names[i] == "Sep":
    months[i] = 9
  elif month_names[i] == "Oct":
    months[i] = 10
  elif month_names[i] == "Nov":
    months[i] = 11
  elif month_names[i] == "Dec":
    months[i] = 12

df['Month']=months

In [110]:
df = df.sort_values(by = ['District','Market', 'Year','Month','Day'])
df = df.reset_index(drop=True)
df.head()

Unnamed: 0.1,Unnamed: 0,District,Market,Date,Imp_Price,Imp_Arrival,PriceNanFlag,ArrivalNanFlag,Month_name,Day,Year,Month
0,189701,Agra,Achnera,01-Jan-2009,418.230143,0.2,True,True,Jan,1,2009,1
1,189702,Agra,Achnera,02-Jan-2009,408.698774,0.2,True,True,Jan,2,2009,1
2,189703,Agra,Achnera,03-Jan-2009,418.751435,0.227286,True,True,Jan,3,2009,1
3,189704,Agra,Achnera,04-Jan-2009,400.0,0.340054,True,True,Jan,4,2009,1
4,189705,Agra,Achnera,05-Jan-2009,400.0,0.2,True,True,Jan,5,2009,1


In [111]:
df= df.drop(['Unnamed: 0'], axis=1)
df.head()

Unnamed: 0,District,Market,Date,Imp_Price,Imp_Arrival,PriceNanFlag,ArrivalNanFlag,Month_name,Day,Year,Month
0,Agra,Achnera,01-Jan-2009,418.230143,0.2,True,True,Jan,1,2009,1
1,Agra,Achnera,02-Jan-2009,408.698774,0.2,True,True,Jan,2,2009,1
2,Agra,Achnera,03-Jan-2009,418.751435,0.227286,True,True,Jan,3,2009,1
3,Agra,Achnera,04-Jan-2009,400.0,0.340054,True,True,Jan,4,2009,1
4,Agra,Achnera,05-Jan-2009,400.0,0.2,True,True,Jan,5,2009,1


In [112]:
latitude_20_markets = [('Rampur', 24.890090999999998),
             ('Muradabad', 28.8334982),
             ('Bareilly', 28.457876),
             ('Lakhimpur', 27.985060150000002),
             ('Basti', 26.724789),
             ('Shahjahanpur', 27.912633149999998),
             ('Faizabad', 26.63807555),
             ('Raibareilly', 26.230299),
             ('Lucknow', 26.8381),
             ('Aligarh', 27.87698975),
             ('Kasganj', 27.883846050000002),
             ('Bijnaur', 29.8575065),
             ('Ballia', 25.877932549999997),
             ('Muzzafarnagar', 29.4115745),
             ('Unnao', 26.57550365),
             ('Gazipur', 25.603508400000003),
             ('Gorakhpur', 26.6711433),
             ('Jaunpur', 25.7955927),
             ('Sultanpur', 26.242510850000002),
             ('Bahraich', 27.7336958)]
longitude_20_markets = [('Rampur', 83.73254274787365),
             ('Muradabad', 78.7732864),
             ('Bareilly', 79.40557093743058),
             ('Lakhimpur', 80.75384538357649),
             ('Basti', 82.79326865024002),
             ('Shahjahanpur', 79.74656294869826),
             ('Faizabad', 82.05902434378625),
             ('Raibareilly', 81.240891),
             ('Lucknow', 80.9346001),
             ('Aligarh', 78.13729027600994),
             ('Kasganj', 78.63489003747873),
             ('Bijnaur', 78.5598995),
             ('Ballia', 84.11995931460379),
             ('Muzzafarnagar', 77.7698696),
             ('Unnao', 80.61376177782856),
             ('Gazipur', 83.50745404887138),
             ('Gorakhpur', 83.36457243864551),
             ('Jaunpur', 82.48834097504385),
             ('Sultanpur', 82.29616931685918),
             ('Bahraich', 81.47732127661058)]

In [117]:
top_20_markets = [i[0] for i in longitude_20_markets]
df = df_new.loc[df_new['Market'].isin(top_20_markets)]
df = df.reset_index(drop=True)

In [118]:
dict_lat ={}
for i in latitude_20_markets:
    dict_lat[i[0]]=i[1]
    
dict_long ={}
for i in longitude_20_markets:
    dict_long[i[0]]=i[1]
    
long_list=[]
lat_list=[]

for i in range(df.shape[0]):
    market = df['Market'][i]
    long_list.append(dict_long[market])
    lat_list.append(dict_lat[market])
    

In [119]:
df['latitude']=lat_list
df['longitude']=long_list

In [120]:
df.head()

Unnamed: 0,District,Market,Arrivals,Date,Modal Price,latitude,longitude
0,Bareilly,Bareilly,25.0,01-Jan-2008,690.0,28.457876,79.405571
1,Bijnor,Bijnaur,2.5,01-Jan-2008,690.0,29.857506,78.5599
2,Faizabad,Faizabad,4.0,01-Jan-2008,400.0,26.638076,82.059024
3,Gorakhpur,Gorakhpur,70.0,01-Jan-2008,575.0,26.671143,83.364572
4,Lucknow,Lucknow,80.0,01-Jan-2008,450.0,26.8381,80.9346


# Adding weather data

In [121]:
df_env = pd.read_csv('Environmental_data_date_wise.csv')
df_env= df_env.drop(['Unnamed: 0'], axis=1)
df_env.head()

Unnamed: 0,longitude,latitude,time,Soil_Type,Humidity,Sunlight,Temperature,Avg_Precipitation,Last_Precipitation
0,77.0,30.0,2008-01-01 00:00:00,1.999985,60.846567,11827470.0,289.040273,3.72529,3.72529
1,77.0,30.0,2008-01-02 00:00:00,1.999985,62.179252,11778960.0,288.757838,3.72529,3.72529
2,77.0,30.0,2008-01-03 00:00:00,1.999985,62.779085,12047220.0,289.694981,3.72529,3.72529
3,77.0,30.0,2008-01-04 00:00:00,1.999985,61.948131,11954350.0,289.775984,3.72529,3.72529
4,77.0,30.0,2008-01-05 00:00:00,1.999985,64.646445,8912370.0,290.826166,5796.862176,3.72529


In [122]:
month_dict = {'01':'Jan','02':'Feb', '03':'Mar','04':'Apr','05': 'May','06': 'Jun','07':'Jul','08': 'Aug','09':'Sep','10': 'Oct','11':'Nov','12':'Dec'}

In [123]:
dates=[]
month_names=[]
years=[]
month=[]

for i in range(df_env.shape[0]):
  dates.append(int(df_env['time'][i][8:10]))
  month.append(int(df_env['time'][i][5:7]))
  month_names.append(month_dict[(df_env['time'][i][5:7])])
  years.append(df_env['time'][i][:4])
    
dates__ = []
for i in range(len(month)):
    date = (df_env['time'][i][8:10])+'-'+month_names[i]+'-'+years[i]
    dates__.append(date)
df_env['Day'] = dates
df_env['Month_name'] = month_names
df_env['Year'] = years
df_env['Month'] = month
df_env['Date'] = dates__

df_env = df_env.drop(['time'],axis=1)

target_lat= df_env['latitude'].unique()
target_long=df_env['longitude'].unique()
lat=df['latitude'].unique()
long=df['longitude'].unique()

In [124]:
lat_dict={}
for l in lat:
    lat_dict[l]=0



for l in lat:
    cur =100
    for t in target_lat:
        if abs(l-t)<cur:
            cur = abs(l-t)
            lat_dict[l]=t
            
long_dict={}
for l in long:
    long_dict[l]=0



for l in long:
    cur =100
    for t in target_long:
        if abs(l-t)<cur:
            cur = abs(l-t)
            long_dict[l]=t

updated_long=[]
updated_lat=[]

for i in range(df.shape[0]):
    lat = df['latitude'][i]
    long = df['longitude'][i]
    
    updated_long.append(long_dict[long])
    updated_lat.append(lat_dict[lat])
    
df['latitude_env'] = updated_lat
df['longitude_env'] = updated_long

df.head()

Unnamed: 0,District,Market,Arrivals,Date,Modal Price,latitude,longitude,latitude_env,longitude_env
0,Bareilly,Bareilly,25.0,01-Jan-2008,690.0,28.457876,79.405571,28.5,79.5
1,Bijnor,Bijnaur,2.5,01-Jan-2008,690.0,29.857506,78.5599,29.75,78.5
2,Faizabad,Faizabad,4.0,01-Jan-2008,400.0,26.638076,82.059024,26.75,82.0
3,Gorakhpur,Gorakhpur,70.0,01-Jan-2008,575.0,26.671143,83.364572,26.75,83.25
4,Lucknow,Lucknow,80.0,01-Jan-2008,450.0,26.8381,80.9346,26.75,81.0


In [125]:
lat=df['latitude_env']
long = df['longitude_env']

df['latitude'] = lat
df['longitude'] = long
df.head()

Unnamed: 0,District,Market,Arrivals,Date,Modal Price,latitude,longitude,latitude_env,longitude_env
0,Bareilly,Bareilly,25.0,01-Jan-2008,690.0,28.5,79.5,28.5,79.5
1,Bijnor,Bijnaur,2.5,01-Jan-2008,690.0,29.75,78.5,29.75,78.5
2,Faizabad,Faizabad,4.0,01-Jan-2008,400.0,26.75,82.0,26.75,82.0
3,Gorakhpur,Gorakhpur,70.0,01-Jan-2008,575.0,26.75,83.25,26.75,83.25
4,Lucknow,Lucknow,80.0,01-Jan-2008,450.0,26.75,81.0,26.75,81.0


In [126]:
df= df.drop(['latitude_env','longitude_env'], axis=1)
df_new = pd.merge(df, df_env, on=['Date','latitude','longitude'])
df_new.head()

Unnamed: 0,District,Market,Arrivals,Date,Modal Price,latitude,longitude,Soil_Type,Humidity,Sunlight,Temperature,Avg_Precipitation,Last_Precipitation,Day,Month_name,Year,Month
0,Bareilly,Bareilly,25.0,01-Jan-2008,690.0,28.5,79.5,1.999985,61.197508,12549770.0,289.933583,3.72529,3.72529,1,Jan,2008,1
1,Bijnor,Bijnaur,2.5,01-Jan-2008,690.0,29.75,78.5,2.99997,59.470442,12674970.0,291.476423,3.72529,3.72529,1,Jan,2008,1
2,Faizabad,Faizabad,4.0,01-Jan-2008,400.0,26.75,82.0,1.999985,46.434779,13117060.0,288.4865,3.72529,3.72529,1,Jan,2008,1
3,Gorakhpur,Gorakhpur,70.0,01-Jan-2008,575.0,26.75,83.25,1.999985,47.313848,13063140.0,289.419094,3.72529,3.72529,1,Jan,2008,1
4,Lucknow,Lucknow,80.0,01-Jan-2008,450.0,26.75,81.0,1.999985,50.346995,12956920.0,287.184343,3.72529,3.72529,1,Jan,2008,1


In [128]:
# df_new= df_new.drop(['Day_y', 'Month_name_y', 'Year_y', 'Month_y'], axis=1)

In [None]:
# df_new['Month'] = df_new['Month_x']
# df_new['Month_name'] = df_new['Month_name_x']
# df_new['Day'] = df_new['Day_x']
# df_new['Year'] = df_new['Year_x']

In [None]:
# df_new= df_new.drop(['Month_name_x', 'Day_x', 'Year_x', 'Month_x'], axis=1)

In [132]:
df_new.columns

Index(['District', 'Market', 'Arrivals', 'Date', 'Modal Price', 'latitude',
       'longitude', 'Soil_Type', 'Humidity', 'Sunlight', 'Temperature',
       'Avg_Precipitation', 'Last_Precipitation', 'Day', 'Month_name', 'Year',
       'Month'],
      dtype='object')

In [133]:
df_new.shape

(42774, 17)

In [135]:
df_new = df_new.sort_values(by = ['District','Market', 'Year','Month','Day'])
df_new = df_new.reset_index(drop=True)
df_new.head()

Unnamed: 0,District,Market,Arrivals,Date,Modal Price,latitude,longitude,Soil_Type,Humidity,Sunlight,Temperature,Avg_Precipitation,Last_Precipitation,Day,Month_name,Year,Month
0,Aligarh,Aligarh,38.0,26-Sep-2012,950.0,28.0,78.25,1.999985,66.869422,17240630.0,303.093921,3.72529,3.72529,26,Sep,2012,9
1,Aligarh,Aligarh,36.0,27-Sep-2012,1000.0,28.0,78.25,1.999985,71.013591,17034940.0,302.634643,3.72529,3.72529,27,Sep,2012,9
2,Aligarh,Aligarh,40.0,28-Sep-2012,1020.0,28.0,78.25,1.999985,71.500668,17098600.0,302.778941,3.72529,3.72529,28,Sep,2012,9
3,Aligarh,Aligarh,40.0,29-Sep-2012,1050.0,28.0,78.25,1.999985,71.251155,16863600.0,303.186585,3.72529,3.72529,29,Sep,2012,9
4,Aligarh,Aligarh,45.0,01-Oct-2012,1020.0,28.0,78.25,1.999985,65.606468,16917850.0,303.553394,3.72529,3.72529,1,Oct,2012,10


In [136]:
df_new.to_csv('Tomato/Tomato/tomato_234mandis_env_post_impute.csv')