In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
def text_to_word_list(text):
    """Normalise free text into one lower-case, space-separated string.

    Despite the name, the result is a single ``str`` (not a list): the input
    is lower-cased, common English contractions are expanded, punctuation is
    replaced by spaces, a few abbreviations are normalised and runs of
    whitespace are collapsed.

    Parameters
    ----------
    text : object
        Value to clean. Anything that is not already a string is passed
        through ``str()`` first.

    Returns
    -------
    str
        The cleaned text with no leading or trailing whitespace.
    """
    text = str(text).lower()

    # Whitelist of characters that survive. '-' is escaped so that "+-=" means
    # three literal characters instead of the range '+'..'=' (which would also
    # let ';' and '<' through). ':' is kept on purpose: it is padded below.
    text = re.sub(r"[^a-z0-9^,!./'+\-=:]", " ", text)

    # Expand contractions while the apostrophes are still present.
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"'s", " ", text)
    text = re.sub(r"'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"'re", " are ", text)
    text = re.sub(r"'d", " would ", text)
    text = re.sub(r"'ll", " will ", text)

    # Must run before hyphens are stripped, otherwise there is nothing to match.
    text = re.sub(r"\be\s*-\s*mail\b", "email", text)

    # Remaining punctuation becomes a word separator.
    text = re.sub(r"[,.!/^+\-=']", " ", text)

    # "5k" -> "5000"; the word boundary keeps units such as "5kg" intact.
    text = re.sub(r"(\d+)k\b", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    # "1990s" -> "1990". A literal "0s" is required here: r"\0s" would be an
    # octal escape for NUL followed by 's'.
    text = re.sub(r"0s\b", "0", text)
    # Keep the surrounding spaces so neighbouring words are not glued together.
    text = re.sub(r" 9 11 ", " 911 ", text)
    # Word boundaries stop this from joining e.g. "raj kumar".
    text = re.sub(r"\bj k\b", "jk", text)

    return re.sub(r"\s{2,}", " ", text).strip()

def categorising(df, column):
    """Replace ``df[column]`` with the integer codes of its categories.

    The column is converted to pandas' ``category`` dtype and overwritten, in
    place, with the resulting codes (missing values receive ``-1``).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; it is mutated by this call.
    column : str
        Label of the column to encode.

    Returns
    -------
    pandas.Series
        The encoded column, read back from ``df``.
    """
    as_category = df[column].astype("category")
    df[column] = as_category.cat.codes
    return df[column]

def preprocess(df):
    """Lower-case every object-dtype column and clean the 'commodity' column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It must contain a ``'commodity'`` column and is
        modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after cleaning.

    Raises
    ------
    KeyError
        If ``df`` has no ``'commodity'`` column.
    """
    for element in df:
        if df[element].dtype == "object":
            df[element] = df[element].str.lower()

    # na_action='ignore' leaves missing commodities as NaN; without it,
    # text_to_word_list would stringify them into the literal word "nan".
    df['commodity'] = df['commodity'].map(text_to_word_list, na_action='ignore')

    return df

In [3]:
# Raw input CSVs, given relative to the notebook's working directory.
CMO_msp_mandi, monthly = (
    "../dataset/CMO_MSP_Mandi.csv",
    "../dataset/Monthly_data_cmo.csv",
)

In [4]:
# MSP/mandi table: load -> encode 'Type' -> lower-case and clean text -> save.
CMO_msp = pd.read_csv(CMO_msp_mandi)
# 'Type' is encoded before preprocess() runs; once it holds integer codes it
# is no longer an object column, so preprocess() does not lower-case it.
CMO_msp['Type'] = categorising(CMO_msp, 'Type')
CMO_msp = CMO_msp.pipe(preprocess)
# Tab-separated output (despite the .csv suffix); the index is written too.
CMO_msp.to_csv('../dataset/clean_CMO.csv', sep='\t')

In [5]:
# Monthly table: load -> encode label columns -> clean text -> save.
monthly_cmo = pd.read_csv(monthly)

# These columns are encoded before preprocess() so their original spelling
# (not the lower-cased one) defines the categories.
for label_column in ('APMC', 'Month', 'district_name', 'state_name'):
    monthly_cmo[label_column] = categorising(monthly_cmo, label_column)

# preprocess() expects a lower-case 'commodity' column, hence the rename first.
monthly_cmo = preprocess(
    monthly_cmo.rename(columns={'Commodity': 'commodity', 'Year': 'year'})
)
monthly_cmo['commodity'] = categorising(monthly_cmo, 'commodity')

# Keep only the second '-'-separated field of 'date' as a number; values that
# cannot be parsed become NaN (presumably this field is the month - confirm
# against the raw file).
monthly_cmo['date'] = pd.to_numeric(
    monthly_cmo['date'].str.split('-').str.get(1),
    errors='coerce',
)

monthly_cmo = monthly_cmo.drop(columns=['Month'])
# Tab-separated output (despite the .csv suffix); the index is written too.
monthly_cmo.to_csv('../dataset/clean_monthly.csv', sep='\t')