# Languages used in conversations

In [1]:
%matplotlib inline
import csv
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from source.conversation_helper import conversation_filter

In [4]:
# Explicit dtypes for the columns this notebook relies on; every other column
# is left to pandas' type inference.
column_dtypes = {
    'id': np.int64,
    'conversation_id': np.int64,
    'screen_name': str,
    'in_reply_to_status_id': object,
    'text': str,
    'num_child_replies': np.int32,
}

# NOTE(review): absolute, machine-specific path -- consider a configurable data directory.
ds = pd.read_csv(
    '/Users/john/data/twitter/tweets_ec_nd_conv.csv',
    dtype=column_dtypes,
)

# conversation_filter comes from source.conversation_helper; its exact
# selection rule is not visible in this notebook.
conv = conversation_filter(ds)

In [5]:

def show_stats():
    """Summarise the size of the loaded data.

    Reads the notebook-level frames ``ds`` and ``conv``.

    Returns:
        pandas.DataFrame: a single column (named with the empty string)
        indexed by 'tweets', 'conversations' and 'languages', holding the
        row count of ``ds``, the number of distinct ``conversation_id``
        values in ``conv`` and the number of distinct ``lang`` values in
        ``conv``.
    """
    row_labels = ['tweets', 'conversations', 'languages']
    counts = [
        ds.shape[0],                     # every row of the raw frame
        conv.conversation_id.nunique(),  # distinct conversations
        conv.lang.nunique(),             # distinct language codes
    ]
    return pd.DataFrame({'': counts}, index=row_labels)

In [6]:
# Display the tweet, conversation and language counts.
show_stats()

Unnamed: 0,Unnamed: 1
tweets,2716464
conversations,286876
languages,44


In [7]:
def show_lang_stats():
    """Rank languages by the number of tweets in ``conv``.

    Reads the notebook-level frame ``conv``.

    Returns:
        pandas.DataFrame: the first five rows (``head()``) of a table sorted
        by descending tweet count, with columns 'lang', 'tweets',
        'percentage' (fraction of all tweets in ``conv``) and 'pcum'
        (running cumulative fraction in that order).
    """
    per_lang = (
        conv.groupby('lang')
        .size()
        .reset_index(name='tweets')
        .sort_values('tweets', ascending=False)
    )
    n_total = per_lang['tweets'].sum()
    per_lang['percentage'] = per_lang['tweets'] / n_total
    per_lang['pcum'] = per_lang['tweets'].cumsum() / n_total
    return per_lang.head()

In [8]:
# Display the five languages with the most tweets.
show_lang_stats()

Unnamed: 0,lang,tweets,percentage,pcum
8,es,723733,0.7915,0.7915
40,und,66231,0.072433,0.863933
7,en,59329,0.064884,0.928817
20,ja,33441,0.036572,0.96539
29,pt,10129,0.011077,0.976467


One of the language codes is `und`, which according to ISO 639 means "undetermined". Let's look at some examples.

In [12]:
# Show the text of ten tweets tagged 'und'. random_state pins the draw so that
# re-running the cell (or the whole notebook) displays the same ten rows.
conv[conv.lang=='und'][['text']].sample(10, random_state=42).values

array([[ '@terrykakiuchi7 @conde_leoconde @PolishRoyalGoat @FJ_Bering @mariamedinacas2 @fukayaqui @awlasky @salseritomc @Carpey66Fran 👄💋'],
       ['@nmosqueraa @theendlessjoke https://t.co/4MzUtFhqfp'],
       ['@velozita8 ssh'],
       ['@SoyAlejMusic siii'],
       ['#SiTeDigoLaVerdadALaCarcel ! https://t.co/6tb6QVR2sP'],
       ['@jenniehidalgo Ñi'],
       ['@myrabatchelder @fumiplagg'],
       ['@AndrexWorld Gn ^^'],
       ['@gabrielaolanota https://t.co/Z9t1DlOJps'],
       ['@WeedmapsEs @Cannabis24h']], dtype=object)

In [13]:
# Keep only 'und' tweets whose in_reply_to_status_id is missing, then preview them.
conv_root = conv.loc[conv.lang.eq('und') & conv.in_reply_to_status_id.isnull()]
conv_root.head()

Unnamed: 0,id,screen_name,name,statuses_count,followers_count,friends_count,favourites_count,time_zone,utc_offset,geo_enabled,...,in_reply_to_screen_name,created_at,source,text,media_url,hashtags,conversation_id,num_replies,num_users,num_tweets
245,838976100160188416,danieelaac,Daniela Abarca,33670,978,89,2750,Central Time (US & Canada),-21600.0,True,...,carlacortesb,Tue Mar 07 04:54:24 +0000 2017,"<a href=""http://twitter.com/download/iphone"" r...",@carlacortesb ya https://t.co/b5C2ZkbnxZ,http://pbs.twimg.com/ext_tw_video_thumb/838975...,,838976100160188416,3,2,3
485,838985524874473472,sipionreyes,Xiomara Sipion,26875,498,200,9518,,,True,...,,Tue Mar 07 05:31:51 +0000 2017,"<a href=""http://twitter.com/download/android"" ...",😢😢😢,,,838985524874473472,12,2,12
568,838994174686932992,FelixMoranS,Felix Moran,278,109,104,67,,,False,...,JorgeRojasCr,Tue Mar 07 06:06:13 +0000 2017,"<a href=""http://twitter.com/download/android"" ...",@JorgeRojasCr https://t.co/0BSCiDanLu,,,838994174686932992,2,2,2
610,839001123340095488,Santinovelota,Santino Velota,57069,22798,21948,20071,,,True,...,,Tue Mar 07 06:33:50 +0000 2017,"<a href=""http://twitter.com/download/android"" ...",https://t.co/LChhdB89qn,http://pbs.twimg.com/media/C6S7EmBWMAMWqTw.jpg,,839001123340095488,2,2,2
622,838936166112116736,eagleventura,El Trujillano,162207,2604,2045,2549,Eastern Time (US & Canada),-14400.0,True,...,,Tue Mar 07 02:15:43 +0000 2017,"<a href=""http://twitter.com/download/android"" ...",#AdiosEntel \n#AdiosEntel \n@EntelPeru https:/...,http://pbs.twimg.com/media/C6R_-VNWgAEaXgh.jpg,AdiosEntel;AdiosEntel,838936166112116736,2,2,2


Tweets with an undetermined language are often associated with a URL, which means there is a video or photo. Next, let's look at how many conversations are in multiple languages.

In [14]:
def show_lang_distribution():
    """Count conversations by how many distinct languages they contain.

    Reads the notebook-level frame ``conv``. For every ``conversation_id``
    the number of distinct ``lang`` values is computed; conversations are
    then grouped by that number and the groups are ordered by descending
    size.

    Side effects:
        Writes the returned table to 'results/lang_distribution.csv'
        (without the index). The 'results' directory is created when it does
        not exist, so the function also works on a fresh checkout.

    Returns:
        pandas.DataFrame: the three largest groups, with columns 'languages'
        (distinct languages per conversation), 'conversations' (number of
        conversations), 'percentage' (fraction of all conversations) and
        'pcum' (running cumulative fraction).
    """
    # NOTE(review): every distinct `lang` value is counted, including 'und';
    # confirm that is intended for the "multiple languages" reading.
    langs_per_conv = conv.groupby('conversation_id').agg({'lang': 'nunique'})
    langs_per_conv = langs_per_conv.reset_index()

    langs = langs_per_conv.groupby('lang').size()
    langs = langs.sort_values(ascending=False)
    langs = langs.reset_index()
    langs.columns = ['languages', 'conversations']

    # Fractions are taken over all conversations, before truncating to the top rows.
    total = langs.conversations.sum()
    langs['percentage'] = langs.conversations / total
    langs['pcum'] = langs.conversations.cumsum() / total
    langs = langs.head(3)

    # to_csv does not create missing parent directories.
    os.makedirs('results', exist_ok=True)
    langs.to_csv('results/lang_distribution.csv', index=False)
    return langs

In [15]:
# Display the three largest languages-per-conversation groups; the call also
# writes the table to results/lang_distribution.csv.
show_lang_distribution()

Unnamed: 0,languages,conversations,percentage,pcum
0,1,216138,0.75342,0.75342
1,2,62992,0.219579,0.972999
2,3,6165,0.02149,0.994489
