In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os

In [2]:
state_abrv = pd.read_csv('States.csv')
state_area = pd.read_csv('State-areas.csv')
state_pop  = pd.read_csv('State-pop.csv')

In [3]:
state_df = pd.DataFrame()
state_df['name']=state_abrv['state']
state_df['abbreviation']=state_abrv['abbreviation']

In [4]:
state_df = state_df.join(state_area.set_index('state') , on='name')

In [5]:
state_df = state_df.join(state_pop.set_index('state/region'),on='abbreviation')

In [11]:
state_df.head

<bound method NDFrame.head of        name abbreviation  area (sq. mi)     ages  year  population
0   Alabama           AL          52423  under18  2012   1117489.0
0   Alabama           AL          52423    total  2012   4817528.0
0   Alabama           AL          52423  under18  2010   1130966.0
0   Alabama           AL          52423    total  2010   4785570.0
0   Alabama           AL          52423  under18  2011   1125763.0
..      ...          ...            ...      ...   ...         ...
50  Wyoming           WY          97818  under18  1993    137458.0
50  Wyoming           WY          97818    total  1991    459260.0
50  Wyoming           WY          97818  under18  1991    136720.0
50  Wyoming           WY          97818  under18  1990    136078.0
50  Wyoming           WY          97818    total  1990    453690.0

[2448 rows x 6 columns]>

In [8]:
state_df.shape

(2448, 6)

In [9]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [17]:
total_population = state_df[state_df['ages']=='total'][['name','year','population']]
total_population

Unnamed: 0,name,year,population
0,Alabama,2012,4817528.0
0,Alabama,2010,4785570.0
0,Alabama,2011,4801627.0
0,Alabama,2009,4757938.0
0,Alabama,2013,4833722.0
...,...,...,...
50,Wyoming,1994,480283.0
50,Wyoming,1992,466251.0
50,Wyoming,1993,473081.0
50,Wyoming,1991,459260.0


In [38]:
total_population = state_df[state_df['ages'] == 'total'][['name', 'year', 'population']]
total_population_grouped = total_population.groupby('name')
state_wise = {}

for state, data in total_population_grouped:
    state_wise[state] = data[['year', 'population']]

In [39]:
state_wise['Alabama']

Unnamed: 0,year,population
0,2012,4817528.0
0,2010,4785570.0
0,2011,4801627.0
0,2009,4757938.0
0,2013,4833722.0
0,2007,4672840.0
0,2008,4718206.0
0,2005,4569805.0
0,2006,4628981.0
0,2004,4530729.0


In [26]:
model = LinearRegression()

In [42]:
x = state_wise['Alabama']['year'].values.reshape(-1, 1)
y = state_wise['Alabama']['population'].values

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)


In [43]:
model.fit(x_train, y_train)

In [44]:
score = model.score(x_test, y_test)
print("Model Score:", score)

Model Score: 0.9846996675588789


In [56]:
predicted_values = list(zip(x_test, prediction, y_test))
sorted_predictions = sorted(predicted_values, key=lambda x: x[0])
for year, pred_population, actual_population in sorted_predictions:
    print("Year:", year[0], "  Predicted :", f'{pred_population:.2f}', "  Actual :", actual_population)


Year: 1995   Predicted : 4264431.29   Actual : 4296800.0
Year: 1997   Predicted : 4331524.57   Actual : 4367935.0
Year: 2001   Predicted : 4465711.11   Actual : 4467634.0
Year: 2006   Predicted : 4633444.30   Actual : 4628981.0
Year: 2012   Predicted : 4834724.12   Actual : 4817528.0


In [60]:
tp = state_df[(state_df['ages']=='total')&(state_df['year']==2000)][['name','population']]
t_u18 = state_df[(state_df['ages']=='under18')&(state_df['year']==2000)][['name','population']]
data = pd.DataFrame(columns=['name','total pop','under 18'])
data = pd.merge(tp, t_u18, on='name', suffixes=('_total', '_under18'))
data.rename(columns={'population_total': 'total pop', 'population_under18': 'under 18'}, inplace=True)

In [61]:
data

Unnamed: 0,name,total pop,under 18
0,Alabama,4452173.0,1122273.0
1,Alaska,627963.0,190615.0
2,Arizona,5160586.0,1373414.0
3,Arkansas,2678588.0,680378.0
4,California,33987977.0,9267089.0
5,Colorado,4326921.0,1106676.0
6,Connecticut,3411777.0,842242.0
7,Delaware,786373.0,194914.0
8,District of Columbia,572046.0,114503.0
9,Florida,16047515.0,3654880.0


In [70]:
x=data['total pop'].values.reshape(-1, 1)
y=data['under 18']

In [71]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [72]:
u18_model=LinearRegression()

In [73]:
u18_model.fit(x_train, y_train)

In [74]:
score = u18_model.score(x_test, y_test)
print("Model Score:", score)

Model Score: 0.9901621338249843


In [85]:
predicted=u18_model.predict(x_test)
results = list(zip(x_test,predicted,y_test)) 
for total,pred,u18 in results:
    print(f'Total Population : {total[0]}   Predicted U18 : {pred}   Actual U18 : {u18}\n')

Total Population : 20944499.0   Predicted U18 : 5425098.334886913   Actual U18 : 5906301.0

Total Population : 4024223.0   Predicted U18 : 1018305.24410688   Actual U18 : 1010641.0

Total Population : 7105817.0   Predicted U18 : 1820889.5014670559   Actual U18 : 1741420.0

Total Population : 1299430.0   Predicted U18 : 308647.85044463724   Actual U18 : 370430.0

Total Population : 8430621.0   Predicted U18 : 2165927.437836282   Actual U18 : 2088885.0

Total Population : 3429708.0   Predicted U18 : 863467.0727694213   Actual U18 : 847511.0

Total Population : 4049021.0   Predicted U18 : 1024763.7472107988   Actual U18 : 994984.0

Total Population : 5311034.0   Predicted U18 : 1353448.111985616   Actual U18 : 1356961.0

Total Population : 2678588.0   Predicted U18 : 667841.9880280909   Actual U18 : 680378.0

Total Population : 3454365.0   Predicted U18 : 869888.8531966059   Actual U18 : 891847.0

Total Population : 12434161.0   Predicted U18 : 3208627.474801488   Actual U18 : 3244944.0



In [88]:
pd.set_option('display.float_format', lambda x: '%.0f' % x)
x_test_values = [x[0] for x in x_test]
df = pd.DataFrame({'Total Population': x_test_values,
                   'Predicted U18': predicted,
                   'Actual U18': y_test})
print(df)

    Total Population  Predicted U18  Actual U18
43          20944499        5425098     5906301
40           4024223        1018305     1010641
46           7105817        1820890     1741420
12           1299430         308648      370430
24           8430621        2165927     2088885
31           3429708         863467      847511
17           4049021        1024764      994984
32           5311034        1353448     1356961
3            2678588         667842      680378
30           3454365         869889      891847
13          12434161        3208627     3244944
