Jake Gluck - Capital News Service - jagluck.github.io

NBA Twitter Analysis

In [110]:
#import libraries
import pandas as pd
import numpy as np
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from io import StringIO
xrange = range
from tokenize import generate_tokens

In [111]:
#load twitter conv data gathered by the scraper
df = pd.read_csv('/Users/jagluck/Documents/GitHub/nba-twitter/nba-conv-good.csv')

NBACAPS = {'CAVS','OKCTHUNDER','CELTICS','NYKNICKS','BROOKLYNNETS','PELICANSNBA', 'PACERS', 'ORLANDOMAGIC','TIMBERWOLVES','MIAMIHEAT',
'HORNETS', 'DETROITPISTONS', 'DALLASMAVS', 'LACLIPPERS', 'LAKERS', 'UTAHJAZZ', 'NUGGETS', 'WASHWIZARDS', 'CHICAGOBULLS',
'SPURS', 'SUNS', 'HOUSTONROCKETS', 'WARRIORS', 'ATLHAWKS', 'MEMGRIZZ', 'BUCKS', 'RAPTORS', 'SACRAMENTOKINGS', 'SIXERS', 'TRAILBLAZERS'}

def parts(a):
    """Split a python-tokenizable expression on comma operators"""
    compos = [-1] # compos stores the positions of the relevant commas in the argument string
    compos.extend(t[2][1] for t in generate_tokens(StringIO(a).readline) if t[1] == ',')
    compos.append(len(a))
    return [ a[compos[i]+1:compos[i+1]] for i in xrange(len(compos)-1)]

#coverts to list of strings
def convertType(toConv, makeUpper):
    
    tcs = []
    for tc in toConv:
        chain = parts(tc)
        
        chain2 = []
        for c in chain:
            c = c.strip('[')
            c = c.strip(']')
            c = c.strip(' \'')
            c = c.strip('\'')
            c = c.strip('@')
            chain2.append(c) 
        
        #make uppercase if specified
        chain3 = []
        if makeUpper:
            for c in chain2:
                chain3.append(c.upper())
            tcs.append(chain3)
        else:
            tcs.append(chain2)
        
    return np.array(tcs)


# Helper: counts time value is in list of lists
def countContains(toCount, data):
    count = 0
    for x in data:
        if toCount in x:
            count = count + 1
    return count

# Helper: stores value when value in list of list is found
def countLengths(toCount, data1, data2):
    count = []
    for x, y in zip(data1, data2):
        if toCount in x:
            count.append(y)
    return np.mean(count)
    
# convert types  
df = df.drop(['Unnamed: 0'], axis=1)
df = df.drop(['to'], axis=1)
df = df.drop(['from'], axis=1)
df['names'] = convertType(df['names'], False)
df['usernames'] = convertType(df['usernames'], True)
df['dates'] = convertType(df['dates'], True)
df['tweets'] = convertType(df['tweets'], True)

lens = []
for ob in df["names"]:
    lens.append(len(ob))
df["num_parties"] = lens


two_parties = df.loc[df["num_parties"] == 2]
sort = df.sort_values(by='num_parties', ascending=False) 

In [112]:
## What is the typical converation size? ##########
df['size'].describe()

count    1917.000000
mean        2.838289
std         2.161278
min         2.000000
25%         2.000000
50%         2.000000
75%         3.000000
max        52.000000
Name: size, dtype: float64

In [113]:
### What teams Tweet the most #######
names = []
participations = []
lengths = []
for name in NBACAPS:
    names.append(name)
    participations.append(countContains(name,df['usernames']))
    lengths.append(countLengths(name,df['usernames'],df['size']))

teams = pd.DataFrame(
        {'name': names,
         'participation': participations,
         'avg_lengths': lengths
        })
    
teams = teams.sort_values(by='participation', ascending=False)
print(teams)
print("Participation " + str(np.mean(teams['participation'])))

#What teams have the longest and shortest conversations
teams = teams.sort_values(by='avg_lengths', ascending=False)
print(teams)

#What teams talk to other teams the most/least? 

def countDuoConvos(name, name2):
    count = 0
    for x in df['usernames']:
        if name in x:
            if name2 in x:
                count = count + 1
    return count
 
tos = []
froms = []
convo_counts = []

for name in NBACAPS:
    for name2 in NBACAPS:
        if name != name2:
            if name < name2:
                tos.append(name)
                froms.append(name2)
                convo_counts.append(countDuoConvos(name, name2))
                
convos = pd.DataFrame(
        {'team_1': tos,
         'team_2': froms,
         'convo_count': convo_counts
        })
 
convos = convos.sort_values(by='convo_count', ascending=False)       
print(convos)
print(np.mean(convos['convo_count']))

    avg_lengths             name  participation
9      2.637813     TRAILBLAZERS            439
11     3.361858         ATLHAWKS            409
25     2.786184  SACRAMENTOKINGS            304
15     3.848214             SUNS            224
3      2.914414     TIMBERWOLVES            222
28     3.505208           SIXERS            192
10     3.021505         UTAHJAZZ            186
17     3.410811            BUCKS            185
23     3.070652         MEMGRIZZ            184
12     3.121547       LACLIPPERS            181
21     3.186335         WARRIORS            161
27     3.441176          NUGGETS            136
4      2.984615     ORLANDOMAGIC            130
0      2.984000      WASHWIZARDS            125
8      2.761468          RAPTORS            109
6      2.622642      PELICANSNBA            106
22     4.448276       DALLASMAVS             87
2      2.465116     BROOKLYNNETS             86
14     3.487179   DETROITPISTONS             78
19     2.986667             CAVS        

In [117]:
# What percent of each team is sent to other teams

team_name = []
to_name = []
percent = []
for t_index, t_row in teams.iterrows():
    total = t_row['participation']
    name = t_row['name']
    for c_index, c_row in convos.iterrows():
        if (name == c_row['team_1']):
            team_name.append(name)
            to_name.append(c_row['team_2'])
            count = c_row['convo_count']
            if (count == 0):
                percent.append(0)
            else:
                percent.append(c_row['convo_count']/total)
            
convo_percent = pd.DataFrame(
        {'from': team_name,
         'to': to_name,
         'percent': percent
        })
        
convo_percent = convo_percent.sort_values(by='percent', ascending=False)  
hawks = convo_percent.loc[convo_percent["from"] == "ATLHAWKS"]
with pd.option_context('display.max_rows', None, 'display.max_columns', 3):
    print(convo_percent)

                from   percent               to
49        OKCTHUNDER  0.411765     TRAILBLAZERS
62           CELTICS  0.257143     TRAILBLAZERS
362     BROOKLYNNETS  0.255814     TRAILBLAZERS
417           LAKERS  0.250000     TRAILBLAZERS
308   HOUSTONROCKETS  0.215385     TRAILBLAZERS
202       LACLIPPERS  0.204420     TRAILBLAZERS
278     CHICAGOBULLS  0.200000             SUNS
280     CHICAGOBULLS  0.200000  SACRAMENTOKINGS
279     CHICAGOBULLS  0.200000     TRAILBLAZERS
352      PELICANSNBA  0.179245     TRAILBLAZERS
87            SIXERS  0.177083             SUNS
406           PACERS  0.171429     TRAILBLAZERS
309   HOUSTONROCKETS  0.169231  SACRAMENTOKINGS
145            BUCKS  0.167568     TRAILBLAZERS
419           LAKERS  0.166667     TIMBERWOLVES
418           LAKERS  0.166667  SACRAMENTOKINGS
240             CAVS  0.160000  SACRAMENTOKINGS
132         NYKNICKS  0.157895  SACRAMENTOKINGS
133         NYKNICKS  0.157895     TRAILBLAZERS
134         NYKNICKS  0.157895         U

In [115]:
# How has the number of conversations changed over time?

years = []
for index, row in df.iterrows():
    year = row['dates'][0][-4:]
    years.append(year)

output = set()
for x in years:
    output.add(x)
output = sorted(list(output))
f_years = []
counts = []

for x in output:
    f_years.append(x)
    counts.append(years.count(x))
    
year_count = pd.DataFrame(
        {'year': f_years,
         'count': counts
        }) 
        
print(year_count)

   count  year
0      8  2012
1     41  2013
2    310  2014
3    518  2015
4    427  2016
5    613  2017


# What was the language of the tweets like/ \sentiment analysis

tweets = df['tweets']
names = df['usernames']
text = []
team = []

for tweet, name in zip(tweets,names):

    for c in tweet:
        text.append(c)
        
    for n in name:
        team.append(n)

neg = []
pos = []
neu = []
comp = []

analyzer = SentimentIntensityAnalyzer()

for t in text:
   vs = analyzer.polarity_scores(t)
   neg.append(vs['neg'])
   pos.append(vs['pos'])
   neu.append(vs['neu'])
   comp.append(vs['compound'])

all_tweets = pd.DataFrame(
        {'text': text,
         'team': team,
         'neg' : neg,
         'pos' : pos,
         'neu' : neu,
         'comp' : comp
        }) 

all_tweets=all_tweets[all_tweets['team'].isin(NBACAPS)]

print("Pos Avg: " + str(all_tweets['pos'].mean()))
print("Pos Max: " + str(all_tweets['pos'].max()))

print("Neg Avg: " + str(all_tweets['neg'].mean()))
print("Neg Max: " + str(all_tweets['neg'].max()))

print("Neu Avg: " + str(all_tweets['neu'].mean()))

print("Comp Avg: " + str(all_tweets['comp'].mean()))

negative = all_tweets[all_tweets['neg'] == 1]

print("\nMost Negative Tweets")
print(negative)

positive = all_tweets[all_tweets['pos'] == 1]
print("\nMost Positive Tweets")
print(positive)

pos_groups = all_tweets.groupby('team')['pos'].mean()
pos_groups = pos_groups.sort_values(ascending=False)
print("\nTeam Pos Avg")
print(pos_groups)

neg_groups = all_tweets.groupby('team')['neg'].mean()
neg_groups = neg_groups.sort_values(ascending=False)
print("\nTeam Neg Avg")
print(neg_groups)

comp_groups = all_tweets.groupby('team')['comp'].mean()
comp_groups = comp_groups.sort_values(ascending=False)
print("\nTeam Comp Avg")
print(comp_groups)