In [76]:
import json
import tqdm
import pandas as pd
import langdetect
from deep_translator import GoogleTranslator
import matplotlib.pyplot as plt
from matplotlib import rcParams

In [77]:
#rcParams default settings
"https://matplotlib.org/stable/tutorials/introductory/customizing.html"

#rcParams settings
plt.style.use('ggplot')

rcParams['font.family'] = 'sans-serif'
rcParams['font.style'] = 'normal'

rcParams['figure.facecolor'] = 'white'

rcParams['savefig.bbox'] = 'tight'
rcParams['savefig.dpi'] = 300
rcParams['savefig.transparent'] = True

rcParams['axes.spines.right'] = False
rcParams['axes.spines.top'] = False
rcParams['axes.labelsize'] = 20
rcParams['axes.labelcolor'] = 'black'
rcParams['axes.edgecolor'] = 'grey'
rcParams['axes.linewidth'] = 3
rcParams['axes.facecolor'] = 'white'
rcParams['axes.titlepad'] = 4

rcParams['xtick.color'] = 'grey'
rcParams['ytick.color'] = 'grey'
rcParams['xtick.major.width'] = 2
rcParams['ytick.major.width'] = 0
rcParams['xtick.major.size'] = 5
rcParams['ytick.major.size'] = 0

rcParams['lines.linewidth'] = 3
rcParams['lines.markersize'] = 10

rcParams['grid.color'] = 'grey'
rcParams['grid.linewidth'] = 0.1

In [78]:
def get_language(text):
    try:
        return langdetect.detect(text)
    except KeyboardInterrupt as e:
        raise(e)
    except:
        return '<-- ERROR -->'
    
def get_translation(text):
    try:
        return GoogleTranslator(source='auto', target='en')\
          .translate(str(text))
    except KeyboardInterrupt as e:
        raise(e)
    except:
        return '<-- ERROR -->'

In [79]:
# Load the JSON file into a DataFrame
with open('data/out/filtered_tweets_2.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Create a DataFrame from the JSON data
df = pd.DataFrame(data)  # Transpose to set tweets as rows

df

Unnamed: 0,attachments,author_id,created_at,edit_history_tweet_ids,entities,id,lang,public_metrics,text,username,geo
0,{'media_keys': ['13_1610292861630840833']},51801586,2023-01-03T15:18:41.000Z,[1610294585242300416],"{'annotations': [{'end': 6, 'normalized_text':...",1610294585242300416,en,"{'bookmark_count': 0, 'impression_count': 3864...",New ABB E-mobility home charging solution help...,abbgroupnews,
1,{'media_keys': ['3_1612750081463095296']},51801586,2023-01-10T09:57:29.000Z,[1612750467859378178],"{'annotations': [{'end': 2, 'normalized_text':...",1612750467859378178,en,"{'bookmark_count': 0, 'impression_count': 4887...",ABB #technology to facilitate #energyefficienc...,abbgroupnews,
2,{'media_keys': ['13_1613112174939103235']},51801586,2023-01-11T10:01:06.000Z,[1613113765595320321],"{'annotations': [{'end': 51, 'normalized_text'...",1613113765595320321,en,"{'bookmark_count': 0, 'impression_count': 2559...",Better decisions for a greener data center! Ho...,abbgroupnews,
3,{'media_keys': ['3_1613462741569736704']},51801586,2023-01-12T09:09:45.000Z,[1613463229665271809],"{'annotations': [{'end': 19, 'normalized_text'...",1613463229665271809,en,"{'bookmark_count': 0, 'impression_count': 2087...",ABB Formula E Season 9 goes green in Mexico!\n...,abbgroupnews,
4,{'media_keys': ['13_1613831618141261825']},51801586,2023-01-13T09:41:43.000Z,[1613833662625353729],"{'annotations': [{'end': 7, 'normalized_text':...",1613833662625353729,en,"{'bookmark_count': 0, 'impression_count': 1975...",The #ABB Decoded #podcast with Frank Muehlon i...,abbgroupnews,
...,...,...,...,...,...,...,...,...,...,...,...
6125,{'media_keys': ['3_1734966670068690949']},156646851,2023-12-13T16:03:45.000Z,[1734967350619701374],"{'annotations': [{'end': 62, 'normalized_text'...",1734967350619701374,en,"{'bookmark_count': 1, 'impression_count': 2549...",🌎: 2023 was the hottest year in recorded histo...,vestas,
6126,{'media_keys': ['3_1737133536577015808']},156646851,2023-12-19T15:32:30.000Z,[1737133815452074094],"{'annotations': [{'end': 61, 'normalized_text'...",1737133815452074094,en,"{'bookmark_count': 1, 'impression_count': 1905...","🇪🇺: To deliver clean, affordable, and secure e...",vestas,
6127,{'media_keys': ['3_1737491113206800384']},156646851,2023-12-20T15:13:16.000Z,[1737491360804938169],"{'annotations': [{'end': 160, 'normalized_text...",1737491360804938169,en,"{'bookmark_count': 1, 'impression_count': 4680...",🌎 /🇬🇧 : In the making: one of the largest offs...,vestas,
6128,,156646851,2023-12-22T11:16:00.000Z,[1738156427468763517],"{'annotations': [{'end': 59, 'normalized_text'...",1738156427468763517,en,"{'bookmark_count': 0, 'impression_count': 992,...",🌎/🇬🇧: Partnering with @RWE_AG to take the Nor...,vestas,


In [80]:
print(f'The number of tweets is {len(df)}')

The number of tweets is 6130


In [81]:
df = df[['username', 'lang', 'text', 'created_at']]

In [82]:
print(f"We have {len(df)} tweets in total, where {len(df.loc[df.lang == 'en'])} are in English language.")

We have 6130 tweets in total, where 5332 are in English language.


In [83]:
#remove qme #media links 
df = df.loc[df.lang != 'qme']
#remove zxx #used when the language is unknown
df = df.loc[df.lang != 'zxx']
#If no language classification can be made for a Tweet
df = df.loc[df.lang != 'und']
#tweets with hashtags only
df = df.loc[df.lang != 'qht']
#lang:qam for tweets with mentions only (works for tweets since 2022-06
df = df.loc[df.lang != 'qam']
#questo non so che è ma lo toglierei
df = df.loc[df.lang != 'art']


In [84]:
no_english = df.loc[df.lang != 'en']

In [85]:
print(f"We have {len(no_english)} tweets in other languages.")

We have 681 tweets in other languages.


In [86]:
print(f"We have {len(no_english.username.unique())} different companies tweeting in non-english languages:")

We have 17 different companies tweeting in non-english languages:


In [87]:
for index, row in no_english.loc[no_english.lang=='ca'].iterrows():
    print(index)
    print(row['text'])

1598
@NetflixFR Objectivement : Zenitsu ⚡️
4098
#PRESS: Novo Nordisk to acquire molecule for uncontrolled hypertension from KBP Biosciences
4669
@AufarAditya Major congrats ✈️


In [88]:
no_english.loc[1598, 'lang'] = 'fr'
no_english.loc[4098, 'lang'] = 'en' 
no_english.loc[4669, 'lang'] = 'en' 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  no_english.loc[1598, 'lang'] = 'fr'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  no_english.loc[4098, 'lang'] = 'en'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  no_english.loc[4669, 'lang'] = 'en'


In [89]:
for index, row in no_english.loc[no_english.lang=='pl'].iterrows():
    print(index)
    print(row['text'])

1485
@PatrickAdemo Pikachu ⚡️
2561
@ChampionsLeague Two UEFA Champions League winners 🙌


In [90]:
for index, row in no_english.loc[no_english.lang=='pl'].iterrows():
    no_english.loc[index, 'lang'] = 'en'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  no_english.loc[index, 'lang'] = 'en'


In [91]:
for index, row in no_english.loc[no_english.lang=='ar'].iterrows():
    print(index)
    print(row['text'])

1909
@shex47171391 مرحبًا ، ليس لدينا رقم Whatsapp. إذا سمحت لي بمعرفة البلد الذي تتواجد فيه ، فقد أتمكن من توفير رابط لمعلومات الاتصال الخاصة بمكتبنا المحلي ، حيث يمكنهم مناقشة مخاوفك.

-Crystal
1914
@shex47171391 لقد قمنا مؤخرًا بتغيير نموذج أعمالنا في العراق. من الآن فصاعدًا ، نهدف إلى توفير الأدوية واللقاحات من شركة GSK مباشرة إلى وزارة الصحة ، أو بالشراكة مع المنظمات عبر الوطنية. سوف تحتاج إلى التحدث إلى وزارة الصحة بخصوص الأدوية.


In [92]:
for index, row in no_english.loc[no_english.lang=='cs'].iterrows():
    print(index)
    print(row['text'])

248
We're happy to provide ČD-Telematika with ETCS Level 2 in the Czech Republic and Slovakia 

▶️Learn more: https://t.co/jeNvzYeE7b

#signalling #ETCS https://t.co/RMXyrZIhUA


In [93]:
for index, row in no_english.loc[no_english.lang=='cs'].iterrows():
    no_english.loc[index, 'lang'] = 'en'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  no_english.loc[index, 'lang'] = 'en'


In [94]:
for index, row in no_english.loc[no_english.lang=='tl'].iterrows():
    print(index)
    print(row['text'])

1228
Massive thanks to all the other fantastic scientists featured in this video! Yang Huang, Xue-Fei Bai, Yang Huang, Monica Wanyi Xia, Jingyan Qu, Ivy Yuhan Li, Ann Zhu Chen, Grace Yuan Yuan, Anna Mei-Tong Lin, Daany Wenhao Li, Echo Yun Yu, and Yiming Ji 👏


In [95]:
for index, row in no_english.loc[no_english.lang=='tl'].iterrows():
    no_english.loc[index, 'lang'] = 'en'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  no_english.loc[index, 'lang'] = 'en'


In [96]:
for index, row in no_english.loc[no_english.lang=='ht'].iterrows():
    print(index)
    print(row['text'])

2565
@ChampionsLeague Griezmann is on fire 🔥


In [97]:
for index, row in no_english.loc[no_english.lang=='ht'].iterrows():
    no_english.loc[index, 'lang'] = 'en'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  no_english.loc[index, 'lang'] = 'en'


In [98]:
for index, row in no_english.loc[no_english.lang=='cy'].iterrows():
    print(index)
    print(row['text'])

2609
@Amy_rights Zero alcohol. 100% smoothness 😉


In [99]:
for index, row in no_english.loc[no_english.lang=='ht'].iterrows():
    no_english.loc[index, 'lang'] = 'en'

In [100]:
print(f"After manually cleaning the data we have {len(no_english)} tweets in languages that are not English.")

After manually cleaning the data we have 681 tweets in languages that are not English.


In [107]:
len(df.loc[df.lang=='en'])

5332

In [108]:
5332+681

6013

#### How many tweets for each language?

In [105]:
new_df = pd.concat([no_english,df.loc[df.lang=='en']])
new_df

Unnamed: 0,username,lang,text,created_at
138,acciona,es,.@ACCIONAEnergia está volcada en el mayor reto...,2023-11-11T11:30:00.000Z
139,acciona,es,Dos islas conectadas por un inmenso puente 🌉\n...,2023-11-12T11:00:00.000Z
140,acciona,es,Ya está disponible en los canales sociales de ...,2023-11-14T14:03:22.000Z
141,acciona,es,"𝐀𝐥𝐞𝐣𝐚𝐧𝐝𝐫𝐨 𝐁𝐞𝐢𝐯𝐢𝐝𝐞, Director de Transformación ...",2023-11-14T16:21:46.000Z
142,acciona,es,Las abejas de la ciudad recogen mucho más que ...,2023-11-14T21:00:01.000Z
...,...,...,...,...
6125,vestas,en,🌎: 2023 was the hottest year in recorded histo...,2023-12-13T16:03:45.000Z
6126,vestas,en,"🇪🇺: To deliver clean, affordable, and secure e...",2023-12-19T15:32:30.000Z
6127,vestas,en,🌎 /🇬🇧 : In the making: one of the largest offs...,2023-12-20T15:13:16.000Z
6128,vestas,en,🌎/🇬🇧: Partnering with @RWE_AG to take the Nor...,2023-12-22T11:16:00.000Z


In [106]:
len(new_df)

6013

#### Translate non-english texts

In [123]:
!pip install translate
!pip install googletrans==3.1.0-alpha



In [124]:
from translate import Translator
from tqdm.notebook import tqdm
from googletrans import Translator

translator = Translator()

For now I am using google translate because is free and has no limit, maybe I can try deepl but for now i just found an api with a small free translation rate per day

In [126]:
english_texts = []
for index, row in tqdm(no_english.iterrows(), total=len(no_english)):
    print(f'row index is {index}')
    translated = translator.translate(row['text'], src=row['lang'], dest="en").text
    print(f'translated text is: {translated}')
    english_texts.append(translated)
    print(f'number of translated text is: {len(english_texts)}')


  0%|          | 0/681 [00:00<?, ?it/s]

row index is 138
translated text is: .@ACCIONAEnergia is focused on the greatest photovoltaic challenge developed in its 30 years of life in the renewable sector: building 🇺🇸 solar plants with a combined power equivalent to a nuclear power plant, with more than two million panels 👇https://t .co/HPfeti7ABI
number of translated text is: 1
row index is 139
translated text is: Two islands connected by a huge bridge 🌉

But not just any one: 2.5 km long with a 650 m cable-stayed main structure. We are talking about the Cebu cable-stayed bridge, one of our largest engineering works in Southeast Asia 👇

https://t.co/CQywj6hV2g https://t.co/Yfv7RU8rfQ
number of translated text is: 2
row index is 140
translated text is: Episode 2 of the docuseries '#LaLuzQuePermanece' is now available on the social channels of @acciona_org. Don't miss the mysteries of the Amazon jungle and the commitment of the Copal Urco community to renewable energies 👇 https://t.co/Fmpu2s8bxi
number of translated text is: 3
r

ValueError: invalid source language

In [127]:
no_english.loc[2057]

username                                                    gsk
lang                                                         in
text          @iarumk Halo - Haleon mengambil alih divisi la...
created_at                             2023-04-17T15:39:56.000Z
Name: 2057, dtype: object

In [120]:
en_texts

['.@ACCIONAEnergia is focused on the greatest photovoltaic challenge developed in its 30 years of life in the renewable sector: building 🇺🇸 solar plants with a combined power equivalent to a nuclear power plant, with more than two million panels 👇https://t .co/HPfeti7ABI',
 'Two islands connected by a huge bridge 🌉\n\nBut not just any one: 2.5 km long with a 650 m cable-stayed main structure. We are talking about the Cebu cable-stayed bridge, one of our largest engineering works in Southeast Asia 👇\n\nhttps://t.co/CQywj6hV2g https://t.co/Yfv7RU8rfQ',
 "Episode 2 of the docuseries '#LaLuzQuePermanece' is now available on the social channels of @acciona_org. Don't miss the mysteries of the Amazon jungle and the commitment of the Copal Urco community to renewable energies 👇 https://t.co/Fmpu2s8bxi",
 "𝐀𝐥𝐞𝐣𝐚𝐧𝐝𝐫𝐨 𝐁𝐞𝐢𝐯𝐢𝐝𝐞, Director of Digital Infrastructure Transformation, will be present tomorrow, Wednesday, November 15, at 9:00 am, in Cantabria 🇪🇸 giving a presentation on Industrial Connec

In [None]:
len(en_texts)

798

In [50]:
for idx, text in zip(indexes, en_texts):
    no_english.loc[idx, 'en_text'] = text

In [52]:
no_english

Unnamed: 0,username,lang,text,created_at,en_text
138,acciona,es,.@ACCIONAEnergia está volcada en el mayor reto...,2023-11-11T11:30:00.000Z,.@ACCIONAEnergia is focused on the greatest ph...
139,acciona,es,Dos islas conectadas por un inmenso puente 🌉\n...,2023-11-12T11:00:00.000Z,Two islands connected by a huge bridge 🌉\n\nBu...
140,acciona,es,Ya está disponible en los canales sociales de ...,2023-11-14T14:03:22.000Z,Episode 2 of the docuseries '#LaLuzQuePermanec...
141,acciona,es,"𝐀𝐥𝐞𝐣𝐚𝐧𝐝𝐫𝐨 𝐁𝐞𝐢𝐯𝐢𝐝𝐞, Director de Transformación ...",2023-11-14T16:21:46.000Z,"𝐀𝐥𝐞𝐣𝐚𝐧𝐝𝐫𝐨 𝐁𝐞𝐢𝐯𝐢𝐝𝐞, Director of Digital Infrast..."
142,acciona,es,Las abejas de la ciudad recogen mucho más que ...,2023-11-14T21:00:01.000Z,City bees collect much more than pollen\n\n🐝🐝🐝...
...,...,...,...,...,...
6088,vestas,da,🇩🇰 / in Danish:\nVi har været med til at start...,2023-09-08T09:49:50.000Z,🇩🇰 / in Danish:\nWe have helped start the Alli...
6089,vestas,da,🇩🇰/ in Danish: I Danmark har vi besluttet at f...,2023-09-13T11:41:01.000Z,"🇩🇰/ in Danish: In Denmark, we have decided to ..."
6108,vestas,und,🇩🇰: #dkbiz #dkpol #dkgreen https://t.co/9BpJhy...,2023-10-04T07:38:30.000Z,🇩🇰: #dkbiz #dkpol #dkgreen https://t.co/9BpJhy...
6115,vestas,da,🇩🇰 (in Danish): Tak for besøget på vores fabri...,2023-11-10T08:47:32.000Z,🇩🇰 (in Danish): Thank you for visiting our fac...


In [53]:
for index, row in no_english.iterrows():
    if index in df.index:
        df.loc[index, 'text'] = row['en_text']
df

Unnamed: 0,username,lang,text,created_at,en_text
0,abbgroupnews,en,New ABB E-mobility home charging solution help...,2023-01-03T15:18:41.000Z,
1,abbgroupnews,en,ABB #technology to facilitate #energyefficienc...,2023-01-10T09:57:29.000Z,
2,abbgroupnews,en,Better decisions for a greener data center! Ho...,2023-01-11T10:01:06.000Z,
3,abbgroupnews,en,ABB Formula E Season 9 goes green in Mexico!\n...,2023-01-12T09:09:45.000Z,
4,abbgroupnews,en,The #ABB Decoded #podcast with Frank Muehlon i...,2023-01-13T09:41:43.000Z,
...,...,...,...,...,...
6125,vestas,en,🌎: 2023 was the hottest year in recorded histo...,2023-12-13T16:03:45.000Z,
6126,vestas,en,"🇪🇺: To deliver clean, affordable, and secure e...",2023-12-19T15:32:30.000Z,
6127,vestas,en,🌎 /🇬🇧 : In the making: one of the largest offs...,2023-12-20T15:13:16.000Z,
6128,vestas,en,🌎/🇬🇧: Partnering with @RWE_AG to take the Nor...,2023-12-22T11:16:00.000Z,


In [59]:
df = df.drop(columns=['en_text'])
df

Unnamed: 0,username,lang,text,created_at
0,abbgroupnews,en,New ABB E-mobility home charging solution help...,2023-01-03T15:18:41.000Z
1,abbgroupnews,en,ABB #technology to facilitate #energyefficienc...,2023-01-10T09:57:29.000Z
2,abbgroupnews,en,Better decisions for a greener data center! Ho...,2023-01-11T10:01:06.000Z
3,abbgroupnews,en,ABB Formula E Season 9 goes green in Mexico!\n...,2023-01-12T09:09:45.000Z
4,abbgroupnews,en,The #ABB Decoded #podcast with Frank Muehlon i...,2023-01-13T09:41:43.000Z
...,...,...,...,...
6125,vestas,en,🌎: 2023 was the hottest year in recorded histo...,2023-12-13T16:03:45.000Z
6126,vestas,en,"🇪🇺: To deliver clean, affordable, and secure e...",2023-12-19T15:32:30.000Z
6127,vestas,en,🌎 /🇬🇧 : In the making: one of the largest offs...,2023-12-20T15:13:16.000Z
6128,vestas,en,🌎/🇬🇧: Partnering with @RWE_AG to take the Nor...,2023-12-22T11:16:00.000Z


In [60]:
df.to_csv('df_for_topic_modelling.csv')