# Temperature Predictions

https://www.hackerrank.com/challenges/temperature-predictions/problem

Given a record containing the maximum and minimum monthly temperatures at a particular station.

The record shows the temperature information for each month in a data range from January 1908 to March 2012.

**Problem:** Some of the temperature values in either t_max or t_min have been blanked out!

**Goal:** Estimate and print the missing values.

## Input Data

First row: N = number of samples

Second row: header

## Output Data

Only the missing values in chronological order.

## Display File Content

In [1]:
# training sample input
with open('Temperature_Predictions_input00.txt','r') as fh:
     data = fh.readlines()
data[:20]

['443\n',
 'yyyy\tmonth\ttmax\ttmin\n',
 '1908\tJanuary\t5.0\t-1.4\n',
 '1908\tFebruary\t7.3\t1.9\n',
 '1908\tMarch\t6.2\t0.3\n',
 '1908\tApril\tMissing_1\t2.1\n',
 '1908\tMay\tMissing_2\t7.7\n',
 '1908\tJune\t17.7\t8.7\n',
 '1908\tJuly\tMissing_3\t11.0\n',
 '1908\tAugust\t17.5\t9.7\n',
 '1908\tSeptember\t16.3\t8.4\n',
 '1908\tOctober\t14.6\t8.0\n',
 '1908\tNovember\t9.6\t3.4\n',
 '1908\tDecember\t5.8\tMissing_4\n',
 '1909\tJanuary\t5.0\t0.1\n',
 '1909\tFebruary\t5.5\t-0.3\n',
 '1909\tMarch\t5.6\t-0.3\n',
 '1909\tApril\t12.2\t3.3\n',
 '1909\tMay\t14.7\t4.8\n',
 '1909\tJune\t15.0\t7.5\n']

## Using data from txt

In [2]:
import numpy as np
import pandas as pd

# training sample input
with open('Temperature_Predictions_input00.txt','r') as fh:
     data = fh.readlines()

# EXTRACT data
i = -2
temperature = []
min_temp = []
max_temp = []
# the interpolation will happen with a complete data corpus, but we need the missing_dict to remember where data was interpolated
missing_dict = {}

for line in data:
    if i == -2:
        # read N from first line
        n_samples = int(line)
        i += 1
    elif i == -1:
        # skip header in second line
        i += 1
    else:
        # read temperature lines 
        temperature_list = line.split("\t")
        
        # Create min_temp list, missing to nan, make entry in missing_dict
        if "Missing" not in temperature_list[2]:
            min_temp.append(float(temperature_list[2]))
        else:
            min_temp.append(np.nan)
            # key = missing value running id, value = column name and row number
            missing_dict[int(temperature_list[2].replace("Missing_","").replace("\n",""))] = ['min',i]
            
        # Create max_temp list, missing to nan, make entry in missing_dict
        if "Missing" not in temperature_list[3]:
            max_temp.append(float(temperature_list[3].replace("\n","")))    
        else:
            max_temp.append(np.nan)
            # key = missing value running id, value = column name and row number
            missing_dict[int(temperature_list[3].replace("Missing_","").replace("\n",""))] = ['max',i]
        
        i += 1
        
# Create Pandas DataFrame
df = pd.DataFrame(zip(min_temp, max_temp))

# INTERPOLATE df columns and create new df. bfill/ffill deal with edge values
new_df = pd.DataFrame(zip(round(df[0].interpolate(axis=0, method='polynomial', order=3).ffill().bfill(), 1),
                          round(df[1].interpolate(axis=0, method='polynomial', order=3).ffill().bfill(), 1)),
                      columns=['min','max'])

# PRINT interpolated values from DataFrame using missing_dictg
for value in missing_dict.values():
    print(new_df[value[0]][value[1]])

8.7
13.6
18.6
0.7
7.2
1.8
6.3
3.9
11.4
7.3
1.2
8.6
16.3
18.8
2.8
5.6
10.8
8.1
13.1
12.9
4.4
19.5
10.7
7.8
-0.1
17.1
18.9
14.4
-2.5
20.4
0.3
12.6
9.5
16.1
15.9
14.9
6.0
0.5
3.9
10.6
17.1
3.3
3.3
1.4
3.2
20.5
2.7
4.2
16.4
11.9
16.7
10.3
10.1
4.7
-0.7
2.3
15.1
14.9
11.2
7.8
17.2
5.9
6.7
11.0
5.3
11.9
11.8
4.1
10.3
16.8
4.9
4.4
6.2
10.6
7.4
10.2
20.1
4.2
2.8
12.1
12.9
2.9
10.2
19.2
0.2
10.6
6.2
4.0
-1.8
6.8
5.5
13.1
13.3
-2.2
9.0
7.8
3.1
1.9
4.0
12.1


## Using stdin

In [3]:
import fileinput
import numpy as np
import pandas as pd


# EXTRACT data
i = -2
temperature = []
min_temp = []
max_temp = []
missing_dict = {}

for line in fileinput.input():
    if i == -2:
        # read N from first line
        n_samples = int(line)
        i += 1
    elif i == -1:
        # skip header in second line
        i += 1
    else:
        # read temperature lines 
        temperature_list = line.split("\t")
        
        # Create min_temp list, missing to nan, make entry in missing_dict
        if "Missing" not in temperature_list[2]:
            min_temp.append(float(temperature_list[2]))
        else:
            min_temp.append(np.nan)
            # key = missing value running id, value = column name and row number
            missing_dict[int(temperature_list[2].replace("Missing_","").replace("\n",""))] = ['min',i]
            
        # Create max_temp list, missing to nan, make entry in missing_dict
        if "Missing" not in temperature_list[3]:
            max_temp.append(float(temperature_list[3].replace("\n","")))    
        else:
            max_temp.append(np.nan)
            # key = missing value running id, value = column name and row number
            missing_dict[int(temperature_list[3].replace("Missing_","").replace("\n",""))] = ['max',i]
        
        i += 1
        
# Create Pandas DataFrame
df = pd.DataFrame(zip(min_temp, max_temp))

# INTERPOLATE df columns and create new df. bfill/ffill deal with edge values
new_df = pd.DataFrame(zip(round(df[0].interpolate(axis=0, method='polynomial', order=3).ffill().bfill(), 1),
                          round(df[1].interpolate(axis=0, method='polynomial', order=3).ffill().bfill(), 1)),
                      columns=['min','max'])

# PRINT interpolated values from DataFrame using missing_dict
for value in missing_dict.values():
    print(new_df[value[0]][value[1]])

FileNotFoundError: [Errno 2] No such file or directory: '-f'