## importing the loanwords file and cleaning it 

In [1]:
! pip install pandas openpyxl
! pip install pyarrow
!pip install nltk

Collecting openpyxl
  Downloading openpyxl-3.1.5-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting et-xmlfile (from openpyxl)
  Downloading et_xmlfile-2.0.0-py3-none-any.whl.metadata (2.7 kB)
Downloading openpyxl-3.1.5-py2.py3-none-any.whl (250 kB)
Downloading et_xmlfile-2.0.0-py3-none-any.whl (18 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-2.0.0 openpyxl-3.1.5
Collecting pyarrow
  Downloading pyarrow-18.1.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Downloading pyarrow-18.1.0-cp312-cp312-manylinux_2_28_x86_64.whl (40.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.1/40.1 MB[0m [31m51.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: pyarrow
Successfully installed pyarrow-18.1.0
Collecting nltk
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting joblib (from nltk)
  Downloading joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting regex>=2021

In [2]:
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
# Load the Excel file
df = pd.read_excel('Dkangliscismer.xlsx', skiprows=2)  # Replace with your file path

# Display the first few rows of the dataframe
print(df.head())

  Language         Domestic form Transliteration Domestic form variants  \
0   Danish             , anyone?             NaN                    NaN   
1   Danish              , du ved             NaN                    NaN   
2   Danish              , hej NN             NaN                    NaN   
3   Danish         , NP eller ej             NaN                    NaN   
4   Danish  , og + {main clause}             NaN                    NaN   

        Etymon Full form of acro/abbr    Type of borrowing       Definition  \
0       anyone                    NaN  unadapted borrowing              NaN   
1   , you know                    NaN     loan translation              NaN   
2      , hi NN                    NaN     loan translation  humorous insult   
3  , NP or not                    NaN     loan translation              NaN   
4          and                    NaN        semantic loan         and then   

            POS  Inflection    Phrasemic type  Gender Pronunciation  \
0  

  warn(msg)


In [4]:
# Remove characters '[]', '.', '+', '{}', 'NN', and numbers from the 'Domestic form' column
df['Domestic form'] = df['Domestic form'].str.replace(r'[\[\].\+\{\}\d]', '', regex=True)  # Remove the special characters and numbers
df['Domestic form'] = df['Domestic form'].str.replace('NN', '', regex=False)  # Remove 'NN'

# Display the updated DataFrame
print(df)


      Language      Domestic form Transliteration Domestic form variants  \
0       Danish          , anyone?             NaN                    NaN   
1       Danish           , du ved             NaN                    NaN   
2       Danish             , hej              NaN                    NaN   
3       Danish      , NP eller ej             NaN                    NaN   
4       Danish  , og  main clause             NaN                    NaN   
...        ...                ...             ...                    ...   
15647   Danish       ål og rejer!             NaN                    NaN   
15648   Danish     ånde  i nakken             NaN                    NaN   
15649   Danish    åndelig ejendom             NaN    åndelig ejendomsret   
15650   Danish  åndelig grusomhed             NaN                    NaN   
15651   Danish       år-til-dato-             NaN                    NaN   

                       Etymon Full form of acro/abbr    Type of borrowing  \
0         

In [5]:
# Specify the columns  to select
selected_columns = ['Domestic form', 'Domestic form variants', 'Etymon', 'Type of borrowing' , 'First attestation']  # Replace with actual column names

# Create a new DataFrame with only the selected columns
df_selected = df[selected_columns]

# Display the result
print(df_selected.head())

       Domestic form Domestic form variants       Etymon    Type of borrowing  \
0          , anyone?                    NaN       anyone  unadapted borrowing   
1           , du ved                    NaN   , you know     loan translation   
2             , hej                     NaN      , hi NN     loan translation   
3      , NP eller ej                    NaN  , NP or not     loan translation   
4  , og  main clause                    NaN          and        semantic loan   

   First attestation  
0             2006.0  
1                NaN  
2                NaN  
3                NaN  
4                NaN  


In [6]:
# Filter DataFrame
filtered_df = df_selected[
    (df['First attestation'].isna()) |  # Keep rows with no information (NaN)
    (df['First attestation'] >= 1945)  # Keep rows where value is 1945 or later
]

In [7]:
# Count rows in the original DataFrame
original_count = len(df_selected)

# Count rows in the filtered DataFrame
filtered_count = len(filtered_df)

# Calculate the number of removed rows
removed_rows = original_count - filtered_count

print(f"Number of rows removed: {removed_rows}")


Number of rows removed: 2008


In [8]:
# one for unadapted borrowings:

unadapted_borrowings = filtered_df[filtered_df['Type of borrowing'] == 'unadapted borrowing']

# Display the filtered DataFrame



In [9]:
print(unadapted_borrowings)

           Domestic form Domestic form variants               Etymon  \
0              , anyone?                    NaN               anyone   
6              , that is                    NaN           , that is.   
7          , The Musical                    NaN        , The Musical   
9        minutes of fame                    NaN   15 minutes of fame   
13     st century skills                    NaN  21st century skills   
...                  ...                    ...                  ...   
15592             zoomer                    NaN               zoomer   
15594            zorbing                    NaN              zorbing   
15595           zucchini                    NaN             zucchini   
15596             zydeco                    NaN               zydeco   
15597              Z-Z-Z                    NaN                Z-Z-Z   

         Type of borrowing  First attestation  
0      unadapted borrowing             2006.0  
6      unadapted borrowing             

In [10]:
selected_loan_types = filtered_df[filtered_df['Type of borrowing'].isin(['unadapted borrowing', 'hybrids', 'Pseudo‐Anglicisms'])]

# Display the filtered DataFrame
print(selected_loan_types)

           Domestic form Domestic form variants               Etymon  \
0              , anyone?                    NaN               anyone   
6              , that is                    NaN           , that is.   
7          , The Musical                    NaN        , The Musical   
9        minutes of fame                    NaN   15 minutes of fame   
13     st century skills                    NaN  21st century skills   
...                  ...                    ...                  ...   
15592             zoomer                    NaN               zoomer   
15594            zorbing                    NaN              zorbing   
15595           zucchini                    NaN             zucchini   
15596             zydeco                    NaN               zydeco   
15597              Z-Z-Z                    NaN                Z-Z-Z   

         Type of borrowing  First attestation  
0      unadapted borrowing             2006.0  
6      unadapted borrowing             

In [11]:
unadapted_borrowings.to_csv('unadapted_borrowings.csv', index=False)

In [12]:
selected_loan_types.to_csv('selected_loan_types.csv', index=False)