# Comparing Beer Company Followers by Tokens and Locations
**By:** _Mike Scheibel_  

In [1]:
import random
import nltk
import numpy as np
import datetime
import tweepy
import re 
from string import punctuation
from collections import Counter
from pprint import pprint
from nltk.corpus import stopwords

sw = set(stopwords.words('english'))

**Going to clean and tokenize the text files for each beer company**

Cleaning text file for __Amstell Light__ followers and tokenizing

In [2]:
amstell = open("AmstelLight_followers.txt", encoding='utf-8').read() # reading in text file

amstell_clean = [w for w in amstell.lower().split()] # cast words to lowercase

amstell_clean = [w.lower() for w in amstell_clean if w.isalpha() and w not in sw] # remove
#stopwords and make sure words are just letters

In [44]:
total_tokens = len(amstell_clean)
unique_tokens = len(set(amstell_clean))
lex_diversity = len(set(amstell_clean))/len(amstell_clean)
avg_token_len = np.mean([len(w) for w in amstell_clean])
top_10 = Counter(amstell_clean).most_common(10)

results = {'tokens':total_tokens,
            'unique_tokens':unique_tokens,
            'avg_token_length':avg_token_len,
            'lexical_diversity':lex_diversity,
            'top_10':top_10}

results

{'tokens': 11696,
 'unique_tokens': 5686,
 'avg_token_length': 5.992305061559508,
 'lexical_diversity': 0.48614911080711354,
 'top_10': [('usa', 107),
  ('new', 83),
  ('golf', 82),
  ('ny', 78),
  ('love', 71),
  ('sports', 66),
  ('beer', 59),
  ('fl', 46),
  ('fan', 45),
  ('south', 41)]}

In [4]:
import pandas as pd #Using pandas to convert text file to dataframe so I can look up location

amstell_df = pd.read_csv("AmstelLight_followers.txt", encoding='utf-8' ,delimiter="\t") #Using tab delimited 
#amstell_df

amstell_df['location'].value_counts()[:20].sort_values(ascending=False) #Counting up each location from dataframe, 
#sorting in descending order 

United States       22
New York, NY        19
Chicago, IL         18
Florida, USA        12
Boston, MA          10
USA                  9
Chicago              8
Atlanta, GA          8
Michigan             7
Ohio, USA            7
California, USA      6
Miami, FL            6
Indiana, USA         6
Charlotte, NC        6
New York             6
Indianapolis, IN     5
Illinois, USA        5
Philadelphia, PA     5
Brooklyn, NY         5
Pittsburgh, PA       5
Name: location, dtype: int64

**Amstell Light Token Analysis**

Amstell light has one of the higher token length (5.99) and lexical diversity(0.49) in followers descriptions. New York seems like a popular location for this beer, with 19 followers and also NY being the fourth most common work in descritpions. It is also interesting that "golf" is one of the more common words in descriptions. Amestell light must be popular among golfers. I also notice that "south" is one of the top words and Florida, Atlanta, and Miami are some most common states of Amstell light followers. 

Cleaning text file for __Bud Light__ followers and tokenizing

In [5]:
bud = open("budlight_followers.txt", encoding='utf-8').read() # reading in text file

bud_clean = [w for w in bud.lower().split()] # cast words to lowercase

bud_clean = [w.lower() for w in bud_clean if w.isalpha() and w not in sw] # remove
#stopwords and make sure words are just letters

In [6]:
total_tokens = len(bud_clean)
unique_tokens = len(set(bud_clean))
lex_diversity = len(set(bud_clean))/len(bud_clean)
avg_token_len = np.mean([len(w) for w in bud_clean])
top_10 = Counter(bud_clean).most_common(10)

results = {'tokens':total_tokens,
            'unique_tokens':unique_tokens,
            'avg_token_length':avg_token_len,
            'lexical_diversity':lex_diversity,
            'top_10':top_10}

results

{'tokens': 29109,
 'unique_tokens': 11924,
 'avg_token_length': 5.870967741935484,
 'lexical_diversity': 0.4096327596276066,
 'top_20': [('usa', 424),
  ('love', 259),
  ('united', 130),
  ('new', 129),
  ('ca', 120),
  ('life', 109),
  ('tx', 105),
  ('like', 102),
  ('fl', 101),
  ('fan', 98)]}

In [7]:
bud_df = pd.read_csv("budlight_followers.txt", encoding='utf-8' ,delimiter="\t")
#bud_df

bud_df['location'].value_counts()[:20].sort_values(ascending=False)

United States          73
California, USA        38
Chicago, IL            25
Texas, USA             22
Los Angeles, CA        18
Ohio, USA              17
USA                    16
Florida, USA           14
Michigan, USA          13
Tampa, FL              13
Virginia, USA          11
New York, USA          11
Georgia, USA           10
Houston, TX            10
Missouri, USA          10
Orlando, FL            10
North Carolina, USA    10
Canada                 10
Atlanta, GA             9
Iowa, USA               9
Name: location, dtype: int64

**Bud Light Token Analysis**

It seems that Bud Light is more popular in the midwest and in California, and less so in east coast states. Florida however has a decent amount of followers of Bud light. The user descriptions are pretty middle of the road, and not much sticks out. 

Cleaning text file for __Coors Light__ followers and tokenizing

In [8]:
coors = open("CoorsLight_followers.txt", encoding='utf-8').read() # reading in text file

coors_clean = [w for w in coors.lower().split()] # cast words to lowercase

coors_clean = [w.lower() for w in coors_clean if w.isalpha() and w not in sw] # remove
#stopwords and make sure words are just letters

In [9]:
total_tokens = len(coors_clean)
unique_tokens = len(set(coors_clean))
lex_diversity = len(set(coors_clean))/len(coors_clean)
avg_token_len = np.mean([len(w) for w in coors_clean])
top_10 = Counter(coors_clean).most_common(10)

results = {'tokens':total_tokens,
            'unique_tokens':unique_tokens,
            'avg_token_length':avg_token_len,
            'lexical_diversity':lex_diversity,
            'top_10':top_10}

results

{'tokens': 29257,
 'unique_tokens': 11881,
 'avg_token_length': 5.77560925590457,
 'lexical_diversity': 0.4060908500529788,
 'top_20': [('usa', 427),
  ('love', 243),
  ('tx', 179),
  ('ca', 158),
  ('life', 136),
  ('like', 117),
  ('united', 115),
  ('fan', 114),
  ('new', 108),
  ('sports', 106)]}

In [10]:
# Had an error on one line in this text file so had to skip that one line when converting to data fram
coors_df = pd.read_csv("CoorsLight_followers.txt", encoding='utf-8', delimiter="\t", error_bad_lines=False) 

# error_bad_lines=False skipped line 3679 which expedted 7 fields, saw 8

# remove # from coors_df to see full dataframe
#coors_df 

coors_df['location'].value_counts()[:20].sort_values(ascending=False)

b'Skipping line 3067: expected 7 fields, saw 8\nSkipping line 3679: expected 7 fields, saw 8\n'


United States          77
Texas, USA             31
California, USA        27
USA                    27
Los Angeles, CA        20
Chicago, IL            19
Ohio, USA              18
Michigan, USA          16
North Carolina, USA    16
Kansas City, MO        15
Tennessee, USA         15
San Diego, CA          14
Houston, TX            13
Omaha, NE              12
Florida, USA           12
San Antonio, TX        12
Las Vegas, NV          12
Indiana, USA           11
Colorado, USA          11
New York, NY           11
Name: location, dtype: int64

**Coors Light Token Analysis**

Coors Light follower descriptions had identical lexical diversity(0.41) to Bud Light, Natural Light and PBR. And like Bud Light, Coors Light has most of its followers living in the Midwest. The one surprise however was that there was only 11 followers that lived in Colorado, which is where Coors Light is brewed. 

Cleaning text file for __Dos Equis__ followers and tokenizing

In [11]:
dos_equis = open("DosEquis_followers.txt", encoding='utf-8').read() # reading in text file

dos_equis_clean = [w for w in dos_equis.lower().split()] # cast words to lowercase

dos_equis_clean = [w.lower() for w in dos_equis_clean if w.isalpha() and w not in sw] # remove
#stopwords and make sure words are just letters

In [12]:
total_tokens = len(dos_equis_clean)
unique_tokens = len(set(dos_equis_clean))
lex_diversity = len(set(dos_equis_clean))/len(dos_equis_clean)
avg_token_len = np.mean([len(w) for w in dos_equis_clean])
top_10 = Counter(dos_equis_clean).most_common(10)

results = {'tokens':total_tokens,
            'unique_tokens':unique_tokens,
            'avg_token_length':avg_token_len,
            'lexical_diversity':lex_diversity,
            'top_10':top_10}

results

{'tokens': 32152,
 'unique_tokens': 12761,
 'avg_token_length': 5.790837272953471,
 'lexical_diversity': 0.3968959940283653,
 'top_20': [('usa', 489),
  ('tx', 449),
  ('love', 219),
  ('de', 195),
  ('new', 182),
  ('ca', 170),
  ('texas', 152),
  ('life', 142),
  ('la', 139),
  ('san', 120)]}

In [13]:
dos_equis_df = pd.read_csv("DosEquis_followers.txt", encoding='utf-8' ,delimiter="\t", error_bad_lines=False)
#dos_equis_df

dos_equis_df['location'].value_counts()[:20].sort_values(ascending=False)

b'Skipping line 4958: expected 7 fields, saw 8\n'


Texas, USA         98
United States      96
Houston, TX        60
Chicago, IL        45
Los Angeles, CA    44
San Antonio, TX    41
Dallas, TX         34
California, USA    33
USA                31
Austin, TX         31
Texas              31
Phoenix, AZ        27
Florida, USA       26
New York, NY       23
Las Vegas, NV      22
México             20
Ohio, USA          19
Mexico             19
New Jersey, USA    17
New York, USA      16
Name: location, dtype: int64

**Dos Equis Token Analysis**

It seems like not surprise that most of Dos Equis followers are in Mexico and surrounding states of Texas, Arizona and California. Dos Equis has an overwhelming amount of followers who live in Texas. Mexico also cracks the top location with 20 followers. 

Cleaning text file for __Guinness__ followers and tokenizing

In [14]:
guinness = open("GuinnessGB_followers.txt", encoding='utf-8').read() # reading in text file

guinness_clean = [w for w in guinness.lower().split()] # cast words to lowercase

guinness_clean = [w.lower() for w in guinness_clean if w.isalpha() and w not in sw] # remove
#stopwords and make sure words are just letters

In [15]:
total_tokens = len(guinness_clean)
unique_tokens = len(set(guinness_clean))
lex_diversity = len(set(guinness_clean))/len(guinness_clean)
avg_token_len = np.mean([len(w) for w in guinness_clean])
top_10 = Counter(guinness_clean).most_common(10)

results = {'tokens':total_tokens,
            'unique_tokens':unique_tokens,
            'avg_token_length':avg_token_len,
            'lexical_diversity':lex_diversity,
            'top_10':top_10}

results

{'tokens': 35456,
 'unique_tokens': 13538,
 'avg_token_length': 6.158506317689531,
 'lexical_diversity': 0.3818253610108303,
 'top_20': [('england', 864),
  ('united', 291),
  ('love', 255),
  ('kingdom', 248),
  ('london', 181),
  ('uk', 179),
  ('views', 138),
  ('fan', 134),
  ('rugby', 128),
  ('scotland', 127)]}

In [16]:
guinness_df = pd.read_csv("GuinnessGB_followers.txt", encoding='utf-8' ,delimiter="\t")
#guinness_df

guinness_df['location'].value_counts()[:20].sort_values(ascending=False)

London, England             121
England, United Kingdom     101
London                       81
United Kingdom               74
Lagos, Nigeria               41
Manchester, England          36
Dublin City, Ireland         28
UK                           26
Glasgow, Scotland            26
Scotland, United Kingdom     25
Birmingham, England          24
Wales, United Kingdom        23
England                      22
South East, England          19
Nigeria                      18
Edinburgh, Scotland          17
North West, England          17
Liverpool, England           15
Leeds, England               14
Manchester                   14
Name: location, dtype: int64

**Guinness Token Analysis**

The first surprise about Guinness was that the United States was not one of the top 20 locations. Most of the followers are from some reagion of the England. The second most surprising fact about Guinness followers was that Lagos, Nigeria was right after England. Another observation that is interesting is that Dublin City has 28 followers of Guiness, but no other city in Ireland makes the top 20. That could be because of population however. One more thing I noticed was that Guiness followers descriptions have the highest token length (6.16) but one of the lower lexical diversity count (0.38). This might have to do with the language difference. The final thing was that "rugby" was on the top 10 words used, and "football" was not. I also notice that Guinness followers seem to have more unique tokens than other beer besides Heineken. 

Cleaning text file for __Heineken USA__ followers and tokenizing

In [17]:
heineken = open("Heineken_US_followers.txt", encoding='utf-8').read() # reading in text file

heineken_clean = [w for w in heineken.lower().split()] # cast words to lowercase

heineken_clean = [w.lower() for w in heineken_clean if w.isalpha() and w not in sw] # remove
#stopwords and make sure words are just letters

In [18]:
total_tokens = len(heineken_clean)
unique_tokens = len(set(heineken_clean))
lex_diversity = len(set(heineken_clean))/len(heineken_clean)
avg_token_len = np.mean([len(w) for w in heineken_clean])
top_10 = Counter(heineken_clean).most_common(10)

results = {'tokens':total_tokens,
            'unique_tokens':unique_tokens,
            'avg_token_length':avg_token_len,
            'lexical_diversity':lex_diversity,
            'top_10':top_10}

results

{'tokens': 31882,
 'unique_tokens': 13872,
 'avg_token_length': 5.876544758798068,
 'lexical_diversity': 0.435104447650712,
 'top_20': [('de', 398),
  ('brasil', 383),
  ('usa', 298),
  ('e', 203),
  ('love', 203),
  ('new', 184),
  ('south', 167),
  ('brazil', 144),
  ('africa', 135),
  ('life', 120)]}

In [45]:
heineken_df = pd.read_csv("Heineken_US_followers.txt", encoding='utf-8' ,delimiter="\t")
#heineken_df

heineken_df['location'].value_counts()[:20].sort_values(ascending=False)

United States                 71
Chicago, IL                   38
Los Angeles, CA               29
California, USA               29
Lagos, Nigeria                28
Sao Paulo, Brazil             28
USA                           25
São Paulo, Brasil             25
New York, NY                  22
Rio de Janeiro, Brazil        21
Florida, USA                  21
Brasil                        21
Texas, USA                    20
South Africa                  19
Rio de Janeiro, Brasil        18
Brazil                        18
Johannesburg, South Africa    17
Nigeria                       17
New Jersey, USA               16
Las Vegas, NV                 14
Name: location, dtype: int64

**Heineken Token Analysis**

Heineken followers are of course in some of the larger cities in the US. However most of the followers seem to be from Africa and Brasil. This is known from the top words, and the location count. I am a soccer fan and know that Heineken sponsored the last few World Cups, so this might have to do with the popularity in Brazil and Africa who were hosts. Also I noticed that Heineken follower descriptions have the highest lexical diversity (0.44). 

Cleaning text file for __Labatt USA__ followers and tokenizing

In [20]:
labatt = open("LabattUSA_followers.txt", encoding='utf-8').read() # reading in text file

labatt_clean = [w for w in labatt.lower().split()] # cast words to lowercase

labatt_clean = [w.lower() for w in labatt_clean if w.isalpha() and w not in sw] # remove
#stopwords and make sure words are just letters

In [21]:
total_tokens = len(labatt_clean)
unique_tokens = len(set(labatt_clean))
lex_diversity = len(set(labatt_clean))/len(labatt_clean)
avg_token_len = np.mean([len(w) for w in labatt_clean])
top_10 = Counter(labatt_clean).most_common(10)

results = {'tokens':total_tokens,
            'unique_tokens':unique_tokens,
            'avg_token_length':avg_token_len,
            'lexical_diversity':lex_diversity,
            'top_10':top_10}

results

{'tokens': 32830,
 'unique_tokens': 12191,
 'avg_token_length': 5.8630520865062445,
 'lexical_diversity': 0.3713371915930551,
 'top_20': [('ny', 879),
  ('usa', 292),
  ('new', 271),
  ('buffalo', 253),
  ('sports', 208),
  ('hockey', 185),
  ('pa', 167),
  ('bills', 147),
  ('love', 147),
  ('mi', 145)]}

In [22]:
labatt_df = pd.read_csv("LabattUSA_followers.txt", encoding='utf-8' ,delimiter="\t")
#labatt_df

labatt_df['location'].value_counts()[:20].sort_values(ascending=False)

Buffalo, NY          350
United States         78
Chicago, IL           40
Pittsburgh, PA        38
New York, USA         37
Michigan, USA         35
Pennsylvania, USA     29
Rochester, NY         29
Philadelphia, PA      28
USA                   23
New York, NY          21
Detroit, MI           18
Boston, MA            18
Charlotte, NC         18
New York              18
Cleveland, OH         17
Columbus, OH          16
Michigan              16
Chicago               15
Syracuse, NY          14
Name: location, dtype: int64

**Labatt USA Token Analysis**

First thing I noticed about Labatt USA followers was that they had the lowest lexical diversity (0.37). The next thing I noticed was Canada was not one of the top locations for followers. It could be because this is the Labatt USA twitter account. Buffalo, NY however which is close to Canada has an overwhelming number of followers(350) compared to the rest of the country. When I look at the top words, I see "Buffalo" and "Bills" on the list which is the NFL team there. 350 followers was triple the size of any other beers top location. It would be interesting to look into why Labatt beer is so popular there. I should also note that "Hockey" is a top word in the descriptions as well, so atleast there is some Canadian reference from these followers.  

Cleaning text file for __Michelob Ultra__ followers and tokenizing

In [23]:
michelob = open("MichelobULTRA_followers.txt", encoding='utf-8').read() # reading in text file

michelob_clean = [w for w in michelob.lower().split()] # cast words to lowercase

michelob_clean = [w.lower() for w in michelob_clean if w.isalpha() and w not in sw] # remove
#stopwords and make sure words are just letters

In [24]:
total_tokens = len(michelob_clean)
unique_tokens = len(set(michelob_clean))
lex_diversity = len(set(michelob_clean))/len(michelob_clean)
avg_token_len = np.mean([len(w) for w in michelob_clean])
top_10 = Counter(michelob_clean).most_common(10)

results = {'tokens':total_tokens,
            'unique_tokens':unique_tokens,
            'avg_token_length':avg_token_len,
            'lexical_diversity':lex_diversity,
            'top_10':top_10}

results

{'tokens': 28316,
 'unique_tokens': 11818,
 'avg_token_length': 5.824268964543014,
 'lexical_diversity': 0.4173612092103404,
 'top_20': [('usa', 376),
  ('love', 224),
  ('ca', 220),
  ('tx', 161),
  ('life', 143),
  ('new', 120),
  ('fl', 114),
  ('sports', 114),
  ('los', 111),
  ('la', 105)]}

In [25]:
michelob_df = pd.read_csv("MichelobULTRA_followers.txt", encoding='utf-8' ,delimiter="\t")
#michelob_df

michelob_df['location'].value_counts()[:20].sort_values(ascending=False)

United States          72
Los Angeles, CA        62
California, USA        38
Houston, TX            32
USA                    26
Texas, USA             26
Chicago, IL            23
Miami, FL              21
Missouri, USA          20
Florida, USA           19
Boston, MA             15
North Carolina, USA    15
Atlanta, GA            15
Los Angeles            14
Austin, TX             14
New Jersey, USA        13
New York, NY           13
Las Vegas, NV          13
Phoenix, AZ            13
Virginia, USA          12
Name: location, dtype: int64

**Michelob Ultra Token Analysis**

This might just be the most boring beer on this list. None of the top words stand out, besides "sports" and the top locations are just the largest cities in the United States.

Cleaning text file for __Miller Light__ followers and tokenizing

In [26]:
miller = open("MillerLite_followers.txt", encoding='utf-8').read() # reading in text file

miller_clean = [w for w in miller.lower().split()] # cast words to lowercase

miller_clean = [w.lower() for w in miller_clean if w.isalpha() and w not in sw] # remove
#stopwords and make sure words are just letters

In [27]:
total_tokens = len(miller_clean)
unique_tokens = len(set(miller_clean))
lex_diversity = len(set(miller_clean))/len(miller_clean)
avg_token_len = np.mean([len(w) for w in miller_clean])
top_10 = Counter(miller_clean).most_common(10)

results = {'tokens':total_tokens,
            'unique_tokens':unique_tokens,
            'avg_token_length':avg_token_len,
            'lexical_diversity':lex_diversity,
            'top_10':top_10}

results

{'tokens': 29681,
 'unique_tokens': 11677,
 'avg_token_length': 5.787001785654122,
 'lexical_diversity': 0.3934166638590344,
 'top_20': [('usa', 481),
  ('love', 239),
  ('tx', 159),
  ('il', 144),
  ('fan', 128),
  ('new', 128),
  ('life', 124),
  ('sports', 122),
  ('like', 105),
  ('pa', 96)]}

In [28]:
miller_df = pd.read_csv("MillerLite_followers.txt", encoding='utf-8' ,delimiter="\t", error_bad_lines=False)
#miller_df

miller_df['location'].value_counts()[:20].sort_values(ascending=False)

b'Skipping line 3109: expected 7 fields, saw 8\n'


Chicago, IL            77
United States          66
Texas, USA             34
Ohio, USA              23
California, USA        23
North Carolina, USA    22
Illinois, USA          20
Florida, USA           20
Pennsylvania, USA      20
Michigan, USA          20
USA                    19
Wisconsin, USA         18
Milwaukee, WI          18
Los Angeles, CA        18
Houston, TX            18
Tennessee, USA         17
Kentucky, USA          15
New York, NY           14
Virginia, USA          14
Philadelphia, PA       14
Name: location, dtype: int64

**Miller Light Token Analysis**

Miller light has one of the lower lexical diversity (0.39)  and average token length (5.79). I also noticed that this beer is very popular in the Midwest states of Illinois, Wisconsin, Ohio and Pennsylvania. Also I notice that North Carolina seems to prefer this beer over the others on the list. 

Cleaning text file for __Modelo USA__ followers and tokenizing

In [29]:
modelo = open("ModeloUSA_followers.txt", encoding='utf-8').read() # reading in text file

modelo_clean = [w for w in modelo.lower().split()] # cast words to lowercase

modelo_clean = [w.lower() for w in modelo_clean if w.isalpha() and w not in sw] # remove
#stopwords and make sure words are just letters

In [30]:
total_tokens = len(modelo_clean)
unique_tokens = len(set(modelo_clean))
lex_diversity = len(set(modelo_clean))/len(modelo_clean)
avg_token_len = np.mean([len(w) for w in modelo_clean])
top_10 = Counter(modelo_clean).most_common(10)

results = {'tokens':total_tokens,
            'unique_tokens':unique_tokens,
            'avg_token_length':avg_token_len,
            'lexical_diversity':lex_diversity,
            'top_10':top_10}

results

{'tokens': 31604,
 'unique_tokens': 12501,
 'avg_token_length': 5.854607011770662,
 'lexical_diversity': 0.39555119605113276,
 'top_20': [('usa', 441),
  ('ca', 397),
  ('tx', 263),
  ('love', 204),
  ('los', 156),
  ('il', 147),
  ('new', 146),
  ('san', 133),
  ('de', 118),
  ('life', 114)]}

In [31]:
modelo_df = pd.read_csv("ModeloUSA_followers.txt", encoding='utf-8' ,delimiter="\t", error_bad_lines=False)
#modelo_df

modelo_df['location'].value_counts()[:20].sort_values(ascending=False)

b'Skipping line 3022: expected 7 fields, saw 8\n'


Chicago, IL            97
United States          86
California, USA        81
Los Angeles, CA        73
Houston, TX            39
Texas, USA             39
Florida, USA           37
Dallas, TX             36
San Antonio, TX        33
Austin, TX             28
Las Vegas, NV          25
USA                    22
New York, NY           20
Chicago                20
Atlanta, GA            20
San Diego, CA          18
Phoenix, AZ            17
Illinois, USA          16
North Carolina, USA    16
Denver, CO             16
Name: location, dtype: int64

**Modelo USA Token Analysis**

The first thing that pops out to me is that Chicago is the top location for Modelo followers. The states I would expect like California, Texas, Nevada, and Arizona come next. Just strange that Chicago has more followers. It is also interesting that Mexico is not one of the top locations on this list. 

Cleaning text file for __Natural Light__ followers and tokenizing

In [32]:
nati = open("naturallight_followers.txt", encoding='utf-8').read() # reading in text file

nati_clean = [w for w in nati.lower().split()] # cast words to lowercase

nati_clean = [w.lower() for w in nati_clean if w.isalpha() and w not in sw] # remove
#stopwords and make sure words are just letters

In [33]:
total_tokens = len(nati_clean)
unique_tokens = len(set(nati_clean))
lex_diversity = len(set(nati_clean))/len(nati_clean)
avg_token_len = np.mean([len(w) for w in nati_clean])
top_10 = Counter(nati_clean).most_common(10)

results = {'tokens':total_tokens,
            'unique_tokens':unique_tokens,
            'avg_token_length':avg_token_len,
            'lexical_diversity':lex_diversity,
            'top_10':top_10}

results

{'tokens': 27502,
 'unique_tokens': 11251,
 'avg_token_length': 5.72867427823431,
 'lexical_diversity': 0.4090975201803505,
 'top_20': [('usa', 475),
  ('love', 193),
  ('oh', 133),
  ('tx', 130),
  ('new', 111),
  ('pa', 107),
  ('university', 105),
  ('life', 104),
  ('united', 103),
  ('states', 94)]}

In [34]:
nati_df = pd.read_csv("naturallight_followers.txt", encoding='utf-8' ,delimiter="\t")
#nati_df

nati_df['location'].value_counts()[:20].sort_values(ascending=False)

United States          84
Ohio, USA              35
Texas, USA             35
Chicago, IL            31
USA                    24
Pittsburgh, PA         23
Florida, USA           22
Illinois, USA          21
Cincinnati, OH         19
Missouri, USA          19
Michigan, USA          19
Virginia, USA          18
North Carolina, USA    17
Indiana, USA           16
Los Angeles, CA        14
St Louis, MO           14
Iowa, USA              14
Cleveland, OH          14
Philadelphia, PA       13
Pennsylvania, USA      13
Name: location, dtype: int64

**Natural Light Token Analysis**

Natural Light followers have the shortest token lenght(5.73). It was also interesting that not many cities where counted in the top locations, and instead just states. Not sure why that would be. I also noticed that Natural Light followers where the only one on this list to have "University" as a top ten word. That makes sense.

Cleaning text file for __Pabst Blue Ribbon__ followers and tokenizing

In [35]:
pbr = open("PabstBlueRibbon_followers.txt", encoding='utf-8').read() # reading in text file

pbr_clean = [w for w in pbr.lower().split()] # cast words to lowercase

pbr_clean = [w.lower() for w in pbr_clean if w.isalpha() and w not in sw] # remove
#stopwords and make sure words are just letters

In [36]:
total_tokens = len(pbr_clean)
unique_tokens = len(set(pbr_clean))
lex_diversity = len(set(pbr_clean))/len(pbr_clean)
avg_token_len = np.mean([len(w) for w in pbr_clean])
top_10 = Counter(pbr_clean).most_common(10)

results = {'tokens':total_tokens,
            'unique_tokens':unique_tokens,
            'avg_token_length':avg_token_len,
            'lexical_diversity':lex_diversity,
            'top_10':top_10}

results

{'tokens': 31554,
 'unique_tokens': 12766,
 'avg_token_length': 5.907650377131267,
 'lexical_diversity': 0.4045762819293909,
 'top_20': [('usa', 493),
  ('love', 171),
  ('beer', 147),
  ('new', 140),
  ('like', 111),
  ('wi', 110),
  ('ca', 108),
  ('il', 108),
  ('sports', 105),
  ('pa', 104)]}

In [37]:
pbr_df = pd.read_csv("PabstBlueRibbon_followers.txt", encoding='utf-8' ,delimiter="\t")
#pbr_df

pbr_df['location'].value_counts()[:20].sort_values(ascending=False)

United States          77
Chicago, IL            57
Wisconsin, USA         38
Ohio, USA              31
Milwaukee, WI          31
Michigan, USA          27
California, USA        24
Los Angeles, CA        22
Pennsylvania, USA      21
Texas, USA             20
Minneapolis, MN        16
USA                    16
Chicago                16
Pittsburgh, PA         15
Philadelphia, PA       15
Atlanta, GA            15
Minnesota, USA         15
Indiana, USA           15
San Antonio, TX        14
North Carolina, USA    14
Name: location, dtype: int64

**Pabst Blue Ribbon Token Analysis**

First thing I noticed, was that out of all of these beers, only PBR and Yuengling had "beer" as a top ten word in follower descriptions. Another thing I noticed was that PBR has almost twice as many followers in the Midwest states than Miller Light, even though they are both orignially brewed in Wisonsin.

Cleaning text file for __Stella Artois__ followers and tokenizing

In [38]:
stella = open("StellaArtois_followers.txt", encoding='utf-8').read() # reading in text file

stella_clean = [w for w in stella.lower().split()] # cast words to lowercase

stella_clean = [w.lower() for w in stella_clean if w.isalpha() and w not in sw] # remove
#stopwords and make sure words are just letters

In [39]:
total_tokens = len(stella_clean)
unique_tokens = len(set(stella_clean))
lex_diversity = len(set(stella_clean))/len(stella_clean)
avg_token_len = np.mean([len(w) for w in stella_clean])
top_10 = Counter(stella_clean).most_common(10)

results = {'tokens':total_tokens,
            'unique_tokens':unique_tokens,
            'avg_token_length':avg_token_len,
            'lexical_diversity':lex_diversity,
            'top_110':top_10}

results

{'tokens': 29998,
 'unique_tokens': 12821,
 'avg_token_length': 5.960997399826655,
 'lexical_diversity': 0.42739515967731184,
 'top_20': [('usa', 390),
  ('love', 342),
  ('new', 190),
  ('de', 189),
  ('ca', 169),
  ('life', 135),
  ('united', 129),
  ('mom', 119),
  ('ny', 115),
  ('south', 105)]}

In [40]:
stella_df = pd.read_csv("StellaArtois_followers.txt", encoding='utf-8' ,delimiter="\t")
#stella_df

stella_df['location'].value_counts()[:20].sort_values(ascending=False)

United States                 86
USA                           63
Los Angeles, CA               50
New York, NY                  40
California, USA               36
Chicago, IL                   32
Florida, USA                  27
Texas, USA                    27
New York, USA                 18
Ohio, USA                     17
New Jersey, USA               15
North Carolina, USA           14
San Francisco, CA             12
England, United Kingdom       12
Miami, FL                     12
Johannesburg, South Africa    11
Georgia, USA                  10
Boston, MA                    10
Atlanta, GA                   10
Austin, TX                    10
Name: location, dtype: int64

**Stella Artois Token Analysis**

Stella like Amstell Light have the highest token length(5.96) and lexical diversity(0.43). I also noticed that Stella has a lot of followers in New York and New Jersey. New York followers seem to like the European beers link Stella, Amstell and Heineken better than other beers. It was also interesting that England was the only European location in the top 20 for Stella Artois. Another intersting thing is that "mom" was one of the top words from the descriptions. Wondering if Stella is a mom beer. 

Cleaning text file for __Yuengling Beer__ followers and tokenizing

In [41]:
#Cleaning text file for Yuengling followers

yuengling = open("yuenglingbeer_followers.txt", encoding='utf-8').read() # reading in text file

yuengling_clean = [w for w in yuengling.lower().split()] # cast words to lowercase

yuengling_clean = [w.lower() for w in yuengling_clean if w.isalpha() and w not in sw] # remove
#stopwords and make sure words are just letters

In [42]:
total_tokens = len(yuengling_clean)
unique_tokens = len(set(yuengling_clean))
lex_diversity = len(set(coors_clean))/len(yuengling_clean)
avg_token_len = np.mean([len(w) for w in yuengling_clean])
top_10 = Counter(yuengling_clean).most_common(10)

results = {'tokens':total_tokens,
            'unique_tokens':unique_tokens,
            'avg_token_length':avg_token_len,
            'lexical_diversity':lex_diversity,
            'top_10':top_10}

results

{'tokens': 33261,
 'unique_tokens': 12398,
 'avg_token_length': 5.914524518204503,
 'lexical_diversity': 0.35720513514326085,
 'top_20': [('usa', 534),
  ('pa', 390),
  ('love', 199),
  ('beer', 196),
  ('sports', 193),
  ('fan', 153),
  ('new', 149),
  ('tx', 145),
  ('life', 120),
  ('united', 111)]}

In [43]:
yuengling_df = pd.read_csv("yuenglingbeer_followers.txt", encoding='utf-8' ,delimiter="\t")
#yuengling_df

yuengling_df['location'].value_counts()[:20].sort_values(ascending=False)

Pennsylvania, USA    100
United States         87
Philadelphia, PA      48
New Jersey, USA       32
Pittsburgh, PA        30
Texas, USA            29
Chicago, IL           25
Florida, USA          25
Ohio, USA             22
Michigan, USA         18
Las Vegas, NV         17
Pottsville, PA        17
Lincoln, NE           16
California, USA       16
Kansas City, MO       15
Nashville, TN         15
Omaha, NE             14
Pennsylvania          14
Los Angeles, CA       14
Dallas, TX            14
Name: location, dtype: int64

**Yuengling Beer Token Analysis**

Last beer on the list Yuengling which has the most followers in Pennsylvania (PA). That makes sense since it is brewed there in Pottsville which has 17 folowers form there. There also seems to be a lot of followers from Nebraska which isn't on the other top locations for the rest of the beers. Also the followers of Yuengling had the lowest lexical diversity (0.36) in there descriptions.