In [2]:
## US Thanksgiving survey analysis with pandas.
## Load the responses from thanksgiving.csv; the file is read with Latin-1
## encoding, which is passed explicitly to read_csv below.
import pandas as pd

data = pd.read_csv("thanksgiving.csv", encoding="Latin-1")

In [3]:
# Preview the first four rows to see the column layout.
print(data.head(4))

   RespondentID Do you celebrate Thanksgiving?  \
0    4337954960                            Yes   
1    4337951949                            Yes   
2    4337935621                            Yes   
3    4337933040                            Yes   

  What is typically the main dish at your Thanksgiving dinner?  \
0                                             Turkey             
1                                             Turkey             
2                                             Turkey             
3                                             Turkey             

  What is typically the main dish at your Thanksgiving dinner? - Other (please specify)  \
0                                                NaN                                      
1                                                NaN                                      
2                                                NaN                                      
3                                                NaN  

In [9]:
## Count the answers to the "celebrate" question; this cell only tallies them,
## the actual filtering of non-relevant rows happens in the next cell.
data["Do you celebrate Thanksgiving?"].value_counts()

Yes    980
No      78
Name: Do you celebrate Thanksgiving?, dtype: int64

In [10]:
# Keep only respondents who celebrate Thanksgiving. The explicit .copy() makes
# `data` an independent frame, so the derived columns assigned to it in later
# cells (int_age, int_income) do not trigger SettingWithCopyWarning.
data = data[data["Do you celebrate Thanksgiving?"] == "Yes"].copy()

In [11]:
## Exploring the main dishes: count how often each main dish was reported.
data["What is typically the main dish at your Thanksgiving dinner?"].value_counts()

Turkey                    859
Other (please specify)     35
Ham/Pork                   29
Tofurkey                   20
Chicken                    12
Roast beef                 11
I don't know                5
Turducken                   3
Name: What is typically the main dish at your Thanksgiving dinner?, dtype: int64

In [15]:
# Gravy answers of the respondents whose main dish is Tofurkey, selected with
# a single .loc call (row mask, column label) instead of two chained lookups.
data.loc[
    data["What is typically the main dish at your Thanksgiving dinner?"] == "Tofurkey",
    "Do you typically have gravy?",
]

4      Yes
33     Yes
69      No
72      No
77     Yes
145    Yes
175    Yes
218     No
243    Yes
275     No
393    Yes
399    Yes
571    Yes
594    Yes
628     No
774     No
820     No
837    Yes
860     No
953    Yes
Name: Do you typically have gravy?, dtype: object

In [17]:
## Finding out how many people ate pies in their Thanksgiving Dinner
# Each *_isnull mask is True where the respondent's apple / pumpkin / pecan
# pie column is null.
apple_isnull = pd.isnull(data["Which type of pie is typically served at your Thanksgiving dinner? Please select all that apply. - Apple"])
pumpkin_isnull = pd.isnull(data["Which type of pie is typically served at your Thanksgiving dinner? Please select all that apply. - Pumpkin"])
pecan_isnull = pd.isnull(data["Which type of pie is typically served at your Thanksgiving dinner? Please select all that apply. - Pecan"])
# NOTE(review): despite its name, `ate_pies` is True only when all three pie
# columns are null, i.e. for respondents with none of these three pies
# recorded. In value_counts(), the False count is therefore the number of
# respondents with at least one of the three columns filled in.
ate_pies = apple_isnull & pumpkin_isnull & pecan_isnull
ate_pies.value_counts()

False    876
True     104
dtype: int64

In [20]:
## Checking the age analysis of the respondents
def str_to_int(string):
    """Convert an age-range label to the integer at the start of the label.

    Parameters
    ----------
    string : str or missing value
        Age label whose first whitespace-separated token is a number,
        optionally containing "+" (presumably labels such as "18 - 29" or
        "60+"; the raw labels are not shown in this notebook).

    Returns
    -------
    int or None
        The leading number of the label, or None when the value is null
        according to pandas or contains only whitespace.

    Raises
    ------
    ValueError
        If the first token is not an integer once "+" is removed.
    """
    if pd.isnull(string):
        return None
    # split() without an argument ignores leading and repeated whitespace, so
    # a label like " 18 - 29" still yields "18" as its first token.
    tokens = string.split()
    if not tokens:
        return None
    # Drop the "+" of open-ended ranges before converting, rather than
    # replacing it with a space and relying on int() to strip it.
    return int(tokens[0].replace("+", ""))

# Derive a numeric age column from the "Age" labels via str_to_int, then
# summarise it.
data["int_age"] = data["Age"].apply(str_to_int)
data["int_age"].describe()



count    947.000000
mean      40.089757
std       15.352014
min       18.000000
25%             NaN
50%             NaN
75%             NaN
max       60.000000
Name: int_age, dtype: float64

#### Because we take the lower bound of every age range,
#### the resulting ages are skewed downward relative to the respondents' true ages.

In [23]:
##Analysing the income data of the respondents
def get_sal(string):
    """Convert an income-range label to the integer at the start of the label.

    Parameters
    ----------
    string : str or missing value
        Income label whose first whitespace-separated token is an amount that
        may contain "$" and "," (presumably labels such as
        "$25,000 to $49,999"; the raw labels are not shown in this notebook),
        or an answer starting with the word "Prefer".

    Returns
    -------
    int or None
        The leading amount as an integer, or None when the value is null
        according to pandas, contains only whitespace, or starts with
        "Prefer".

    Raises
    ------
    ValueError
        If the first token is not an integer once "$" and "," are removed.
    """
    if pd.isnull(string):
        return None
    # split() without an argument ignores leading and repeated whitespace.
    tokens = string.split()
    # Blank answers and answers starting with "Prefer" carry no amount.
    if not tokens or tokens[0] == "Prefer":
        return None
    # Strip the thousands separators and the currency sign before converting.
    return int(tokens[0].replace(",", "").replace("$", ""))

# Derive a numeric income column from the household-income labels via
# get_sal, then summarise it.
data["int_income"] = data["How much total combined money did all members of your HOUSEHOLD earn last year?"].apply(get_sal)
data["int_income"].describe()



count       829.000000
mean      75965.018094
std       59068.636748
min           0.000000
25%                NaN
50%                NaN
75%                NaN
max      200000.000000
Name: int_income, dtype: float64

#### Again, we have taken the lower bound of each income range,
#### which skews the income figures downward.

#### Exploring the relationship between income and travelling for Thanksgiving
#### Hypothesis: lower income -> younger people -> travel home

In [25]:
# Restrict to respondents whose parsed income (int_income) is below 50000,
# then tabulate how far that group travels for Thanksgiving.
sal_data = data[data["int_income"] < 50000]
dist_data = sal_data["How far will you travel for Thanksgiving?"]
dist_data.value_counts()

Thanksgiving is happening at my home--I won't travel at all                         106
Thanksgiving is local--it will take place in the town I live in                      92
Thanksgiving is out of town but not too far--it's a drive of a few hours or less     64
Thanksgiving is out of town and far away--I have to drive several hours or fly       16
Name: How far will you travel for Thanksgiving?, dtype: int64

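#### Hypothesis: people with lower incomes (mostly younger people) tend to travel home for Thanksgiving,
#### whereas people with higher incomes tend to host it at home.
#### Note that the counts above cover only respondents with an income below 50,000; most of them
#### stay at home or celebrate locally, and the higher-income group is not tabulated here.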

#### Linking friendship and Age
#### Usually younger people have Thanksgiving with friends or Friendsgiving
#### And usually these people also have lower income.
#### Based on average age of respondents.

In [27]:
# int_age aggregated for every combination of the two yes/no questions.
# No aggfunc is passed, so pivot_table applies its default (the mean).
data.pivot_table(
    index="Have you ever tried to meet up with hometown friends on Thanksgiving night?",
    columns = 'Have you ever attended a "Friendsgiving?"',
    values = "int_age"
)

"Have you ever attended a ""Friendsgiving?""",No,Yes
Have you ever tried to meet up with hometown friends on Thanksgiving night?,Unnamed: 1_level_1,Unnamed: 2_level_1
No,42.283702,37.010526
Yes,41.47541,33.976744


In [28]:
## Based on average income of respondents
# Same pivot as for int_age, but aggregating int_income. No aggfunc is
# passed, so pivot_table applies its default (the mean).
data.pivot_table(
    index="Have you ever tried to meet up with hometown friends on Thanksgiving night?",
    columns = 'Have you ever attended a "Friendsgiving?"',
    values = "int_income"
)

"Have you ever attended a ""Friendsgiving?""",No,Yes
Have you ever tried to meet up with hometown friends on Thanksgiving night?,Unnamed: 1_level_1,Unnamed: 2_level_1
No,78914.549654,72894.736842
Yes,78750.0,66019.736842


#### Thus we can see that the average age of people who have attended a Friendsgiving is about 34-37.
#### The average age of people who have met up with hometown friends is about 34-41.
#### The average income of people who have attended a Friendsgiving is about 66K-73K.
#### The average income of people who have met up with hometown friends is about 66K-79K.