In [112]:
import pickle
import pandas as pd
from geopy import distance
pd.set_option('display.max_columns', None)
# useful functions and classes

# This class stores the latitude and longitude of a sample, and indicates 
# if this location has the desired variable we are estimating
class Location:
    def __init__(self,latitude,longitude,hasv,ID,value):
        self.ID = ID
        self.latitude = latitude
        self.longitude = longitude
        self.hasv = hasv
        self.value = value
        
    def __str__(self):
        return str(self.ID)

# Calculates the distance between 2 samples in km
def getdist(S1,S2):
    # radius of earth in km
    coords_1 = (S1.latitude, S1.longitude)
    coords_2 = (S2.latitude, S2.longitude)
    dist = distance.distance(coords_1, coords_2).km
    return dist

# filters out data if a point is missing in one of the colunns
def filterblanks(columns,data,blank):
    # if blank is true, rows with blanks in these columns
    # if blank is false, remove rows with non blanks or non zeros in these columns
    for c in columns:
        if blank:
            data = data[data[c].notnull()]
        else:
            data = data[data[c].isnull() | (data[c]==0)]
    return data

# PRE: all locations in the dataframe are
# unique
def DistanceMatrix(dataframe,variable):
    # the list of location objects
    locations = []
    # the list of indexes where the the row is located in the dataframe
    #indexes = []
    for index,row in dataframe.iterrows():
        # make a location object on this row
        if pd.isnull(row[variable]):
            hasv = False
        else:
            hasv = True
        locations.append(Location(row["LATITUDE"],row["LONGITUDE"],hasv,row["LOCATCD"],row[variable]))
        #indexes.append(index)
        
    matrix = pd.DataFrame(0,index=locations,columns=locations)
    for ci,column in enumerate(locations):
        for ri,row in enumerate(locations):
            if ri>ci:
                # compute distance between column and row
                dist = getdist(row,column)
            elif ci>ri:
                dist = matrix.iloc[ci,ri]
            # put this distance in the dataframe
            else:
                continue
            matrix.iloc[ri,ci] = dist
    return matrix

def changeVar(DM,data,variable):
    locations = DM.index
    # loop through each location
    for loc in location:
        ID = loc.ID
        row = data[data["LOCATCD"]==ID]
        if pd.isnull(row[variable]):
            loc.hasv = False
            loc.value = None
        else:
            loc.hasv = True
            loc.value = row[variable]
            
    DM.index = locations
    DM.columns = locations
        
def getclosest(numclosest,distancematrix,location):
    # Make a set of the closest locations that contain variable
    closest = {}
    column = distancematrix.loc[:,location].copy()
    print(type(distancematrix.index[0]))
    # Filter the locations that dont have the desired variable
    doesnthavev = []
    for i in range(len(column)-1):
        if not column.index[i].hasv:
            doesnthavev.append(column.index[i])
    column.drop(doesnthavev,inplace = True)
    
    #print(type(column))
    column.sort_values(inplace = True)
    #print(column.iloc[0])

    column.iloc[0] = 0
    #print(column.iloc[0])
    if column.iloc[0]==0:
        column = column.iloc[1:]
    return column.iloc[0:numclosest]

# Key: Location Code
# Value: List of tuples (locatcd,distance,value)
def makeDict(data,variable,numclosest=2):
    D = DistanceMatrix(data,variable)
    # Loop through each location without a value for variable
    closestDict = {}
    for loc in D.columns:
        if not loc.hasv:
            # Get the closest locations to loc
            closest = getclosest(numclosest,D,loc)
            # The list of tuples that contain location id, the distance, and the value for variable
            tuples = []
            for i,dist in enumerate(closest):
                ID = closest.index[i].ID
                val = closest.index[i].value
                tuples.append((ID,dist,val))
            closestDict[loc.ID] = tuples
    return closestDict

def predict(tuples,numclosest = 2):
    loc2 = tuples[0]
    loc3 = tuples[1]
    d12 = loc2[1]
    val2 = loc2[2]
    d13 = loc3[1]
    val3 = loc3[2]
    
    c2 = d12/(d12+d13)
    c3 = d13/(d12+d13)
    
    predicted = c2*val2+c3*val3
    
    return predicted

# NEEDS WORK
def addpredictions(df,variables,numclosest):
    # make prediction and insert for each variable
    first = True
    for var in variables:
        if first:
            DM = DistanceMatrix(df,var)
        else:
            changeVar(DM,df,var)
            
        for loc in DM.columns:
            if not loc.hasv:
                # Get the closest locations to loc
                closest = getclosest(numclosest,DM,loc)
                # The list of tuples that contain location id, the distance, and the value for variable
                tuples = []
                for i,dist in enumerate(closest):
                    ID = closest.index[i].ID
                    val = closest.index[i].value
                    tuples.append((ID,dist,val))
                closestDict[loc.ID] = tuples
                

In [115]:
data = pickle.load(open( "water_data_coords.p", "rb" ))
path = r"C:\Users\cashe\OneDrive\Desktop\Data Science\Mississippi River analysis\Krouth_water_and_veg_data_w_latlong\ltrm_water_data_lat_long.csv"
doug_data = pd.read_csv(path,low_memory=False)

In [116]:
data.drop(data.columns.difference(['TN','TP','TPQF','TNQF','SS','SSQF',
                                         'TURB','TURBQF','WDP',
                                         'TEMP','TEMPQF','DO','DOQF','COND',
                                         'CONDQF','VEL','VELQF','FLDEAST',
                                         'FLDNORTH','PROJCD','FLDNUM','DATE',
                                  'LOCATCD','STRATUM','CHLcal','SECCHI','SECCHIQF','LATITUDE','LONGITUDE']), 1, inplace=True)
print("After filtering columns: ",data.shape)
print("Now filtering sampling design")
data = data[(data.PROJCD == "M-")]
print("After filtering sampling design: ",data.shape)
print("Now filtering Pool 13")
data = data[(data.FLDNUM == 3)]
print("After filtering Pool 13: ",data.shape)
print("Now adding a year column")
data["YEAR"] = pd.DatetimeIndex(data["DATE"]).year
print(data.shape)
print("Adding a timecode column")
data["TIME CODE"] = data["LOCATCD"].astype(str).apply(lambda x: x[3])
print(data.shape)
print("Filtering by year 1997")
data = data[data["YEAR"]==1997]
print(data.shape)
print("Filtering by summer")
data = data[data["TIME CODE"] == '2']
print(data.shape)
#print("Dropping 'bad' data")
#QFcols = ['TPQF','TNQF','SSQF','TURBQF','TEMPQF','DOQF','CONDQF','VELQF','SECCHIQF']
#qualdata = filterblanks(QFcols,data,False)
#print(qualdata.shape)
#print("Dropping all blank columns")
#qualdata.drop(['PROJCD','FLDEAST','FLDNORTH','TPQF','TNQF','SSQF','TURBQF','TEMPQF','DOQF',
#                                         'CONDQF','VELQF','SECCHIQF'], 1, inplace=True)
#print(qualdata.shape)
print("Filtering out points with blank entries in at least one of the columns")
cols = ['TN','TP','TEMP','DO','TURB','COND','VEL','SS','WDP','CHLcal','SECCHI']
#for col in cols:
#    print(col,data[col].isna().sum())
qualdata_noprediction = filterblanks(cols,data,True)
print(qualdata_noprediction.shape)

After filtering columns:  (106052, 29)
Now filtering sampling design
After filtering sampling design:  (106052, 29)
Now filtering Pool 13
After filtering Pool 13:  (17991, 29)
Now adding a year column
(17991, 30)
Adding a timecode column
(17991, 31)
Filtering by year 1997
(613, 31)
Filtering by summer
(151, 31)
Filtering out points with blank entries in at least one of the columns
(64, 31)


In [97]:
data[data["SECCHIQF"].isnull()&data["TEMPQF"].isnull()&data["DOQF"].isnull()&data["TURBQF"].isnull()&data["CONDQF"].isnull()
    &data["VELQF"].isnull()&data["TPQF"].isnull()&data["TNQF"].isnull()&(data["SSQF"]==0)].shape

(52, 31)

In [98]:
data.shape

(151, 31)

In [100]:
data[(data["SECCHIQF"]==0)|(data["TEMPQF"]==0)|(data["DOQF"]==0)|(data["TURBQF"]==0)|(data["CONDQF"]==0)|
    (data["VELQF"]==0)|(data["TPQF"]==0)|(data["TNQF"]==0)].shape

(81, 31)

In [108]:
DictTN = makeDict(data,"TN")
DictTP = makeDict(data,"TP")

<class '__main__.Location'>
<class '__main__.Location'>
<class '__main__.Location'>
<class '__main__.Location'>
<class '__main__.Location'>
<class '__main__.Location'>
<class '__main__.Location'>
<class '__main__.Location'>
<class '__main__.Location'>
<class '__main__.Location'>
<class '__main__.Location'>
<class '__main__.Location'>
<class '__main__.Location'>
<class '__main__.Location'>
<class '__main__.Location'>
<class '__main__.Location'>
<class '__main__.Location'>
<class '__main__.Location'>
<class '__main__.Location'>
<class '__main__.Location'>
<class '__main__.Location'>
<class '__main__.Location'>
<class '__main__.Location'>
<class '__main__.Location'>
<class '__main__.Location'>
<class '__main__.Location'>
<class '__main__.Location'>
<class '__main__.Location'>
<class '__main__.Location'>
<class '__main__.Location'>
<class '__main__.Location'>
<class '__main__.Location'>
<class '__main__.Location'>
<class '__main__.Location'>
<class '__main__.Location'>
<class '__main__.Loc

In [117]:
#put in predicted TN
data["PredictedTN"] = 0
for index,row in data.iterrows():
    if pd.isnull(row["TN"]):
        data.loc[index,"PredictedTN"] = predict(DictTN[row["LOCATCD"]])
    else:
        data.loc[index,"PredictedTN"] = row["TN"]

data["PredictedTP"] = 0
for index,row in data.iterrows():
    if pd.isnull(row["TP"]):
        data.loc[index,"PredictedTP"] = predict(DictTP[row["LOCATCD"]])
    else:
        data.loc[index,"PredictedTP"] = row["TP"]      

In [110]:
data.shape

(151, 33)

In [118]:
print("Filtering out points with blank entries in at least one of the columns")
cols = ['PredictedTN','PredictedTP','TEMP','DO','TURB','COND','VEL','SS','WDP','CHLcal','SECCHI']
#for col in cols:
#    print(col,data[col].isna().sum())
qualdata_prediction = filterblanks(cols,data,True)
print(qualdata_prediction.shape)

Filtering out points with blank entries in at least one of the columns
(120, 33)


In [51]:
data["PredictedTP"].describe()

count    151.000000
mean       0.265133
std        0.248787
min        0.015000
25%        0.167185
50%        0.185000
75%        0.209663
max        1.355000
Name: PredictedTP, dtype: float64

In [52]:
data["TP"].describe()

count    81.000000
mean      0.246494
std       0.220238
min       0.015000
25%       0.167000
50%       0.185000
75%       0.203000
max       1.355000
Name: TP, dtype: float64

In [53]:
data["PredictedTN"].describe()

count    151.000000
mean       2.195319
std        0.373832
min        0.899000
25%        2.013365
50%        2.282123
75%        2.449000
max        2.736000
Name: PredictedTN, dtype: float64

In [54]:
data["TN"].describe()

count    81.000000
mean      2.214444
std       0.411976
min       0.899000
25%       2.129000
50%       2.315000
75%       2.477000
max       2.736000
Name: TN, dtype: float64

In [57]:
data.isna().sum()

FLDNUM           0
DATE             0
PROJCD           0
LOCATCD          0
WDP              1
SECCHI           1
SECCHIQF       139
STRATUM          0
FLDEAST        151
FLDNORTH       151
TEMP             1
TEMPQF         150
DO               1
DOQF           150
TURB             1
TURBQF         150
COND             1
CONDQF         150
VEL             31
VELQF          120
TP              70
TPQF            70
TN              70
TNQF            70
SS               1
SSQF             1
CHLcal           1
LATITUDE         0
LONGITUDE        0
YEAR             0
TIME CODE        0
PredictedTN      0
PredictedTP      0
dtype: int64

In [62]:
pickle.dump(data, open( "summer_1997.p", "wb" ) )

In [13]:
matrix = DistanceMatrix(data,"TN")

In [14]:
array = matrix.to_numpy()

In [15]:
def transpose(mat, tr, N): 
    for i in range(N): 
        for j in range(N): 
            tr[i][j] = mat[j][i] 
   
# Returns true if mat[N][N] is symmetric, else false 
def isSymmetric(mat, N): 
      
    tr = [ [0 for j in range(len(mat[0])) ] for i in range(len(mat)) ] 
    transpose(mat, tr, N) 
    for i in range(N): 
        for j in range(N): 
            if (mat[i][j] != tr[i][j]): 
                return False
    return True
   
# Driver code 
if (isSymmetric(array, 151)): 
    print ("Yes")
else: 
    print ("No")

Yes


(151, 151)

In [7]:
closest = getclosest(3,matrix,matrix.columns[58])

<class '__main__.Location'>
0.0


9732061     0.000000
9732060     1.097430
9732062     1.612008
9732008     2.011911
9732025     3.130428
             ...    
9732050    20.539670
9732022    20.800828
9732064    21.923900
9732001    22.458735
9732021    25.755405
Name: 9732061, Length: 81, dtype: float64

0.0


9732060     1.097430
9732062     1.612008
9732008     2.011911
9732025     3.130428
9732037     3.286177
             ...    
9732050    20.539670
9732022    20.800828
9732064    21.923900
9732001    22.458735
9732021    25.755405
Name: 9732061, Length: 80, dtype: float64

In [38]:
for i,dist in enumerate(closest):
    print(closest.index[i].value,dist)

1.075 1.0974304557353052
1.7619999999999998 1.612008115880842
2.51 2.0119109371293327


In [39]:
closest

9732060    1.097430
9732062    1.612008
9732008    2.011911
Name: 9732061, dtype: float64

In [96]:
data = pd.DataFrame({'Brand' : ['Maruti', 'Hyundai', 'Tata', 
                                'Mahindra', 'Maruti', 'Hyundai', 
                                'Renault', 'Tata', 'Maruti'], 
                     'Year' : [2012, 2014, 2011, 2015, 2012,  
                               2016, 2014, 2018, 2019], 
                     'Kms Driven' : [50000, 30000, 60000,  
                                     25000, 10000, 46000,  
                                     31000, 15000, 12000], 
                     'City' : ['Gurgaon', 'Delhi', 'Mumbai',  
                               'Delhi', 'Mumbai', 'Delhi',  
                               'Mumbai','Chennai',  'Ghaziabad'], 
                     'Mileage' :  [28, 27, 25, 26, 28,  
                                   29, 24, 21, 24]}) 

In [102]:
s = data.loc[:,"Brand"]
s[s.index==4]

4    Maruti
Name: Brand, dtype: object

In [54]:
df = pd.DataFrame(0,
     index=['cobra', 'viper', 'sidewinder'],
     columns=['max_speed', 'shield'])

In [55]:
df

Unnamed: 0,max_speed,shield
cobra,0,0
viper,0,0
sidewinder,0,0


In [107]:
range(1,5)

range(1, 5)

In [24]:
d = {"Loc1":[('LOC1','dist','val'),('LOC2','dist','val')]}

In [25]:
nest = d["Loc1"]

In [20]:
f = 'hey'
g = 'loo'
g+f

'loohey'