## Imports And Setup


In [9]:
import pandas as pd
from collections import Counter

from nltk.corpus import words
from nltk.tokenize import word_tokenize

In [10]:
# Load the labelled tense dataset (one sentence and its tense label per row)
# and display it as the cell output.
df = pd.read_excel(
    io='EnglishTenseUniqueDataset.xlsx',
    engine='openpyxl',
)
df

Unnamed: 0,Sentence,Labels
0,by 2050 ai architects will have designed selfc...,Future
1,in the future sustainable transportation optio...,Future
2,china has been actively involved in peacekeepi...,Present
3,educational diversity is a hallmark of foreign...,Present
4,the coach substituted an underperforming player,Past
...,...,...
13311,he will become a good person,Future
13312,their door opens after eleven,Future
13313,my mother will cook delicious food,Future
13314,i am going to win this race,Future


## Sentences

In [11]:
# Flag each sentence that contains at least one digit character (regex \d).
# na=False makes a missing sentence count as "no digit", so the column has no missing values.
df['has_digit'] = df['Sentence'].str.contains(r'\d', regex=True, na=False)

In [12]:
# Put the sentences that contain digits first (True sorts before False when descending).
# NOTE(review): no `kind` is passed, so pandas uses its default sort algorithm, which is not
# stable; the relative order of rows inside each group need not follow the original index
# order. The seeded sampling that follows presumably depends on this row order — confirm
# before changing the sort.
df = df.sort_values(by='has_digit', ascending=False)

In [13]:
# Draw a reproducible sample (fixed random_state) of 500 sentences from each group:
# sentences containing a digit and sentences without one.
# has_digit is a boolean column, so it is used directly (or negated) as the row mask.
has_digit_true_samples = (
    df.loc[df['has_digit']]
    .sample(n=500, random_state=1)['Sentence']
    .tolist()
)
has_digit_false_samples = (
    df.loc[~df['has_digit']]
    .sample(n=500, random_state=1)['Sentence']
    .tolist()
)


In [None]:
# Persist the sampled sentences, one per line: the digit group first, then the
# non-digit group.
# Both groups are joined in a single pass so that the last digit sentence and the
# first non-digit sentence are separated by a newline. Writing two separate
# "\n".join(...) results back to back fused those two sentences onto one line,
# which then matched neither sentence when the file was read back line by line.
with open('sentences.txt', 'w') as file:
    file.write("\n".join(has_digit_true_samples + has_digit_false_samples))

In [None]:
# Some Stats
# Character-level frequencies over the whole corpus: every sentence is cast to str,
# concatenated and lower-cased, and only alphanumeric characters are tallied.
all_text = ''.join(df['Sentence'].astype(str)).lower()
counts = Counter(filter(str.isalnum, all_text))

# Split the tally into digits and letters. The insertion order of `counts` (order of
# first appearance in the text) is kept, which is why the index labels of the printed
# tables are not in ascending order after sorting by character.
digit_counts = dict(item for item in counts.items() if item[0].isdigit())
letter_counts = dict(item for item in counts.items() if item[0].isalpha())

# One row per character, ordered by the character itself.
digit_frequency = (
    pd.DataFrame(list(digit_counts.items()), columns=['Digit', 'Frequency'])
    .sort_values(by='Digit')
)
letter_frequency = (
    pd.DataFrame(list(letter_counts.items()), columns=['Letter', 'Frequency'])
    .sort_values(by='Letter')
)

# Display the results
print("Digit Frequency:\n", digit_frequency)
print("\nLetter Frequency:\n", letter_frequency)


Digit Frequency:
   Digit  Frequency
1     0       1200
4     1        155
0     2       1051
7     3        324
3     4        368
2     5        212
5     6         87
8     7         81
9     8         83
6     9         90

Letter Frequency:
    Letter  Frequency
2       a      64873
0       b      14198
5       c      31926
13      d      29687
8       e     106230
16      f      16114
14      g      21181
6       h      31524
3       i      72394
21      j        784
24      k       4227
11      l      40809
19      m      20138
15      n      63741
17      o      54798
22      p      20081
25      q        797
4       r      53862
9       s      54837
7       t      68433
18      u      21395
12      v      13885
10      w      12506
20      x       2491
1       y      13062
23      z       1749


In [15]:
# Summarise the frame after adding has_digit: index range, per-column non-null counts,
# dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13316 entries, 0 to 13315
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Sentence   13316 non-null  object
 1   Labels     13316 non-null  object
 2   has_digit  13316 non-null  bool  
dtypes: bool(1), object(2)
memory usage: 325.1+ KB


## FT Sentences

In [16]:
# Read sentences.txt back and keep its lines as a set, so the membership test used to
# exclude them from the dataset is fast and ignores duplicates.
with open('sentences.txt') as sentences_file:
    sampled_text = sentences_file.read()
exclude_sentences = set(sampled_text.splitlines())

In [17]:
# Keep only the rows whose sentence text is NOT among the sentences loaded into
# exclude_sentences, then display the result.
remaining_sentences = df.loc[~df['Sentence'].isin(exclude_sentences)]
remaining_sentences


Unnamed: 0,Sentence,Labels,has_digit
1797,biomedical text mining will have facilitated t...,Future,True
1823,by 2030 gene editing technology will have elim...,Future,True
11657,he completed a puzzle with over 1000 pieces,Past,True
8519,by 2070 centralized ict technologies will have...,Future,True
5318,by 2290 robotic surgery will have been integra...,Future,True
...,...,...,...
4645,the fans sang chants to rally behind their team,Past,False
4646,physicists are conducting experiments to explo...,Present,False
4647,language proficiency is assessed through langu...,Present,False
4648,many projects have incorporated green roofs pr...,Present,False


In [11]:
# Write the remaining sentences to ft_sentences.txt, one per line.
# The joined text is a single string, so write() is the right call: writelines()
# expects an iterable of lines and, given a string, iterates it character by
# character. The resulting file content is the same.
with open('ft_sentences.txt', 'w') as f:
    f.write('\n'.join(remaining_sentences['Sentence']))