# Use transformers to analyze sentiment
## Load the result from Ruiqiang Liu

In [1]:
import pandas as pd
import missingno as msno
# Load the tweet export into `df`; the bare `df` on the last line renders it
# as the cell output.
df = pd.read_csv(filepath_or_buffer='./data/en_Tennis.csv')
df

Unnamed: 0,coordinates,location,text,created_at,lang,hashTags,cleaned_location,cleaned_text,translated_text
0,"38.3436365,-0.4881708","Alicante, Comunidad Valenciana",Alucinando como se está moviendo Garbiñe por l...,Sat Jul 15 14:19:06 +0000 2017,es,VamosGarbiñe Wimbledon,"Alicante, Comunidad Valenciana",Alucinando como se está moviendo Garbiñe por l...,Hallucinating how Garbiñe is moving along the ...
1,"40.4167047,-3.7035825",Madrid,Vamos Muguruza queda poco! #WimbledonMovistar,Sat Jul 15 14:19:08 +0000 2017,es,WimbledonMovistar,Madrid,Vamos Muguruza queda poco! WimbledonMovistar,Come on Muguruza is little left! #WimbledonMov...
2,"42.34461,-3.70051",León - Burgos - Philippines,Enjoying #wimbledon final @ Varacafé https://t...,Sat Jul 15 14:19:09 +0000 2017,en,wimbledon,León - Burgos - Philippines,Enjoying wimbledon final Varacafé,Enjoying wimbledon final Varacafé
3,"51.4893335,-0.1440551",London,It's a little like watching the Murray QF agai...,Sat Jul 15 14:19:10 +0000 2017,en,vamosGarbi Wimbledon,London,It's a little like watching the Murray QF agai...,It's a little like watching the Murray QF agai...
4,"51.309559,-0.0557157","Warlingham,Surrey",@GarbiMuguruza must surely now have one hand 🤚...,Sat Jul 15 14:19:11 +0000 2017,en,Wimbledon,"Warlingham, Surrey",must surely now have one hand on the trophy W...,must surely now have one hand on the trophy W...
...,...,...,...,...,...,...,...,...,...
10159,"53.4071991,-2.99168",Liverpool,Is I normal to sweat this much whilst playing ...,Tue Jul 18 22:56:13 +0000 2017,en,,Liverpool,Is I normal to sweat this much whilst playing ...,Is I normal to sweat this much whilst playing ...
10160,"51.4194864,-0.2056131",London,Ahh summer isn't here until you have a Pina Co...,Tue Jul 18 22:56:37 +0000 2017,en,,London,Ahh summer isn't here until you have a Pina Co...,Ahh summer isn't here until you have a Pina Co...
10161,"45.1941383,9.8663353",grumello cremonese cr,"""La forza è anche quella di saper armonizzare ...",Tue Jul 18 22:58:43 +0000 2017,it,Federer,grumello cremonese cr,"""La forza è anche quella di saper armonizzare ...","""The strength is also to know how to harmonize..."
10162,"42.6384261,12.674297",Italia,"Non saprei, ma so che gli piace giocare a tenn...",Tue Jul 18 23:19:11 +0000 2017,it,,Italia,"Non saprei, ma so che gli piace giocare a tennis","I don't know, but I know he likes to play tenn..."


## Sentiment Analysis Based on English Text

In [2]:
import os
# Keep Hugging Face downloads in a project-local directory. HF_HOME is set
# in a cell that runs before `transformers` is imported.
cache_dir = "./cache"
os.environ["HF_HOME"] = cache_dir

In [None]:
from transformers import pipeline

# Checkpoint used for sentiment scoring; the saved outputs in this notebook
# show it producing POSITIVE / NEGATIVE labels.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"

# Build the sentiment pipeline once and reuse it for every row.
# device=0 targets the first GPU, so this cell expects one to be available.
classifier = pipeline(
    task="sentiment-analysis",
    model=checkpoint,
    device=0,
)

def safe_classify(text):
    """Classify a single text with the module-level ``classifier`` without raising.

    Args:
        text: A cell value from a text column. Only non-empty ``str`` values
            are sent to the model.

    Returns:
        A dict with the keys ``'label'`` and ``'score'``:

        * the first element of ``classifier([text])`` for a non-empty string;
        * ``{'label': 'UNKNOWN', 'score': 0.0}`` for anything that is not a
          non-empty string (None, NaN, pd.NA, numbers, list-likes, ...);
        * ``{'label': 'ERROR', 'score': 0.0}`` if the classifier raises.
    """
    # Check the type first. Every missing-value marker (None, NaN, pd.NA) is a
    # non-str, so this one test covers them, and it avoids calling pd.isna on
    # list-like values, whose array result cannot be used in a boolean `or`
    # chain (ValueError: truth value is ambiguous).
    if not isinstance(text, str) or text == '':
        return {'label': 'UNKNOWN', 'score': 0.0}
    try:
        return classifier([text])[0]
    except Exception as e:
        # Deliberate best-effort: one failing row must not abort the whole
        # column. The message below reads "classification error".
        print(f"分类错误: {e}")
        return {'label': 'ERROR', 'score': 0.0}
    
# Score every translated tweet; each element of `results` is a dict with
# 'label' and 'score' keys (see safe_classify).
results = df['translated_text'].apply(safe_classify)

# Unpack the dicts into two plain columns, in row order.
df['bert_conf'] = [prediction['score'] for prediction in results]
df['bert_sentiment'] = [prediction['label'] for prediction in results]


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


In [5]:
# Persist the scored frame (now carrying bert_conf and bert_sentiment) so it
# can be reloaded without re-running the classifier.
# Optional missing-value overview: msno.matrix(df)
df.to_csv('./data/Tennis_sst2.csv', encoding='UTF-8', index=False)

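## Filter by time
1. Before the Final: tweets created before 2017-07-16 16:30 (the reference time used in the code below)
2. After the Final: tweets created at or after that time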

In [6]:
from datetime import datetime, date
from time_format import convert_twitter_time
def add_gametime_column(df, reference_time="2017-07-16 16:30"):
    """Tag every row as 'before' or 'after' a reference moment.

    Writes a 'gametime' column onto ``df`` itself (the frame is modified in
    place) and returns that same frame.

    Args:
        df: DataFrame with a 'created_at' column whose values are accepted by
            ``convert_twitter_time`` (in this notebook they look like
            "Sat Jul 15 14:19:06 +0000 2017").
        reference_time: Cut-off moment as a "%Y-%m-%d %H:%M" string.

    Returns:
        The same DataFrame object, with 'gametime' set to 'before' for rows
        strictly earlier than the cut-off and 'after' otherwise.

    Raises:
        ValueError: If ``reference_time`` does not match "%Y-%m-%d %H:%M".
    """
    minute_format = "%Y-%m-%d %H:%M"
    cutoff = datetime.strptime(reference_time, minute_format)

    def label_for(created_at):
        # The helper renders the timestamp as a minute-resolution string,
        # which is parsed back into a datetime for the comparison.
        # NOTE(review): any timezone handling lives inside
        # convert_twitter_time -- confirm it matches the cut-off's timezone.
        rendered = convert_twitter_time(created_at, minute_format)
        moment = datetime.strptime(rendered, minute_format)
        if moment < cutoff:
            return 'before'
        return 'after'

    df['gametime'] = df['created_at'].apply(label_for)
    return df

import pandas as pd
import missingno as msno
# Reload the scored tweets from disk and tag each row relative to the default
# reference time of add_gametime_column; the trailing `df` displays the result.
df = add_gametime_column(pd.read_csv('./data/Tennis_sst2.csv'))
df

Unnamed: 0,coordinates,location,text,created_at,lang,hashTags,cleaned_location,cleaned_text,translated_text,bert_conf,bert_sentiment,gametime
0,"38.3436365,-0.4881708","Alicante, Comunidad Valenciana",Alucinando como se está moviendo Garbiñe por l...,Sat Jul 15 14:19:06 +0000 2017,es,VamosGarbiñe Wimbledon,"Alicante, Comunidad Valenciana",Alucinando como se está moviendo Garbiñe por l...,Hallucinating how Garbiñe is moving along the ...,0.981437,POSITIVE,before
1,"40.4167047,-3.7035825",Madrid,Vamos Muguruza queda poco! #WimbledonMovistar,Sat Jul 15 14:19:08 +0000 2017,es,WimbledonMovistar,Madrid,Vamos Muguruza queda poco! WimbledonMovistar,Come on Muguruza is little left! #WimbledonMov...,0.995477,NEGATIVE,before
2,"42.34461,-3.70051",León - Burgos - Philippines,Enjoying #wimbledon final @ Varacafé https://t...,Sat Jul 15 14:19:09 +0000 2017,en,wimbledon,León - Burgos - Philippines,Enjoying wimbledon final Varacafé,Enjoying wimbledon final Varacafé,0.999035,POSITIVE,before
3,"51.4893335,-0.1440551",London,It's a little like watching the Murray QF agai...,Sat Jul 15 14:19:10 +0000 2017,en,vamosGarbi Wimbledon,London,It's a little like watching the Murray QF agai...,It's a little like watching the Murray QF agai...,0.999396,NEGATIVE,before
4,"51.309559,-0.0557157","Warlingham,Surrey",@GarbiMuguruza must surely now have one hand 🤚...,Sat Jul 15 14:19:11 +0000 2017,en,Wimbledon,"Warlingham, Surrey",must surely now have one hand on the trophy W...,must surely now have one hand on the trophy W...,0.887019,NEGATIVE,before
...,...,...,...,...,...,...,...,...,...,...,...,...
10159,"53.4071991,-2.99168",Liverpool,Is I normal to sweat this much whilst playing ...,Tue Jul 18 22:56:13 +0000 2017,en,,Liverpool,Is I normal to sweat this much whilst playing ...,Is I normal to sweat this much whilst playing ...,0.998403,NEGATIVE,after
10160,"51.4194864,-0.2056131",London,Ahh summer isn't here until you have a Pina Co...,Tue Jul 18 22:56:37 +0000 2017,en,,London,Ahh summer isn't here until you have a Pina Co...,Ahh summer isn't here until you have a Pina Co...,0.935683,NEGATIVE,after
10161,"45.1941383,9.8663353",grumello cremonese cr,"""La forza è anche quella di saper armonizzare ...",Tue Jul 18 22:58:43 +0000 2017,it,Federer,grumello cremonese cr,"""La forza è anche quella di saper armonizzare ...","""The strength is also to know how to harmonize...",0.999818,POSITIVE,after
10162,"42.6384261,12.674297",Italia,"Non saprei, ma so che gli piace giocare a tenn...",Tue Jul 18 23:19:11 +0000 2017,it,,Italia,"Non saprei, ma so che gli piace giocare a tennis","I don't know, but I know he likes to play tenn...",0.993368,POSITIVE,after


In [7]:
# Minimum confidence for keeping the model's POSITIVE/NEGATIVE label.
threshold = 0.7  # You can adjust this value as needed

# Three-way label: keep the model's label, but replace it with 'NEUTRAL'
# wherever the confidence is below the threshold.
# NOTE(review): rows scored 0.0 by safe_classify (UNKNOWN / ERROR) are below
# any positive threshold and therefore also become 'NEUTRAL' -- confirm this
# is intended.
df['bert_sentiment3'] = df['bert_sentiment'].mask(df['bert_conf'] < threshold, 'NEUTRAL')
df

Unnamed: 0,coordinates,location,text,created_at,lang,hashTags,cleaned_location,cleaned_text,translated_text,bert_conf,bert_sentiment,gametime,bert_sentiment3
0,"38.3436365,-0.4881708","Alicante, Comunidad Valenciana",Alucinando como se está moviendo Garbiñe por l...,Sat Jul 15 14:19:06 +0000 2017,es,VamosGarbiñe Wimbledon,"Alicante, Comunidad Valenciana",Alucinando como se está moviendo Garbiñe por l...,Hallucinating how Garbiñe is moving along the ...,0.981437,POSITIVE,before,POSITIVE
1,"40.4167047,-3.7035825",Madrid,Vamos Muguruza queda poco! #WimbledonMovistar,Sat Jul 15 14:19:08 +0000 2017,es,WimbledonMovistar,Madrid,Vamos Muguruza queda poco! WimbledonMovistar,Come on Muguruza is little left! #WimbledonMov...,0.995477,NEGATIVE,before,NEGATIVE
2,"42.34461,-3.70051",León - Burgos - Philippines,Enjoying #wimbledon final @ Varacafé https://t...,Sat Jul 15 14:19:09 +0000 2017,en,wimbledon,León - Burgos - Philippines,Enjoying wimbledon final Varacafé,Enjoying wimbledon final Varacafé,0.999035,POSITIVE,before,POSITIVE
3,"51.4893335,-0.1440551",London,It's a little like watching the Murray QF agai...,Sat Jul 15 14:19:10 +0000 2017,en,vamosGarbi Wimbledon,London,It's a little like watching the Murray QF agai...,It's a little like watching the Murray QF agai...,0.999396,NEGATIVE,before,NEGATIVE
4,"51.309559,-0.0557157","Warlingham,Surrey",@GarbiMuguruza must surely now have one hand 🤚...,Sat Jul 15 14:19:11 +0000 2017,en,Wimbledon,"Warlingham, Surrey",must surely now have one hand on the trophy W...,must surely now have one hand on the trophy W...,0.887019,NEGATIVE,before,NEGATIVE
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10159,"53.4071991,-2.99168",Liverpool,Is I normal to sweat this much whilst playing ...,Tue Jul 18 22:56:13 +0000 2017,en,,Liverpool,Is I normal to sweat this much whilst playing ...,Is I normal to sweat this much whilst playing ...,0.998403,NEGATIVE,after,NEGATIVE
10160,"51.4194864,-0.2056131",London,Ahh summer isn't here until you have a Pina Co...,Tue Jul 18 22:56:37 +0000 2017,en,,London,Ahh summer isn't here until you have a Pina Co...,Ahh summer isn't here until you have a Pina Co...,0.935683,NEGATIVE,after,NEGATIVE
10161,"45.1941383,9.8663353",grumello cremonese cr,"""La forza è anche quella di saper armonizzare ...",Tue Jul 18 22:58:43 +0000 2017,it,Federer,grumello cremonese cr,"""La forza è anche quella di saper armonizzare ...","""The strength is also to know how to harmonize...",0.999818,POSITIVE,after,POSITIVE
10162,"42.6384261,12.674297",Italia,"Non saprei, ma so che gli piace giocare a tenn...",Tue Jul 18 23:19:11 +0000 2017,it,,Italia,"Non saprei, ma so che gli piace giocare a tennis","I don't know, but I know he likes to play tenn...",0.993368,POSITIVE,after,POSITIVE


In [8]:
# Overwrite the saved CSV with the frame that now also carries the
# 'gametime' and 'bert_sentiment3' columns.
df.to_csv('./data/Tennis_sst2.csv', encoding='UTF-8', index=False)